// This file is part of VkFFT, a Vulkan Fast Fourier Transform library // // Copyright (C) 2020 - present Dmitrii Tolmachev // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef VKFFT_H #define VKFFT_H #include #include #include #include #include #include #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif #include #if(VKFFT_BACKEND==0) #include "vulkan/vulkan.h" #include "glslang_c_interface.h" #elif(VKFFT_BACKEND==1) #include #include #include #include #include #elif(VKFFT_BACKEND==2) #include #include #include #include #elif(VKFFT_BACKEND==3) #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #endif #ifdef __APPLE__ #include #else #include #endif #elif(VKFFT_BACKEND==4) #include #elif(VKFFT_BACKEND==5) #define NS_PRIVATE_IMPLEMENTATION #define CA_PRIVATE_IMPLEMENTATION #define MTL_PRIVATE_IMPLEMENTATION #include "Foundation/Foundation.hpp" #include "QuartzCore/QuartzCore.hpp" #include "Metal/Metal.hpp" #endif #ifdef VkFFT_use_FP128_Bluestein_RaderFFT #include "fftw3.h" #endif typedef struct { //WHDCN layout //required parameters: uint64_t FFTdim; //FFT dimensionality (1, 2 or 3) uint64_t size[3]; // WHD -system dimensions #if(VKFFT_BACKEND==0) VkPhysicalDevice* physicalDevice;//pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices VkDevice* device;//pointer to Vulkan device, created with vkCreateDevice VkQueue* queue;//pointer to Vulkan queue, created with vkGetDeviceQueue VkCommandPool* commandPool;//pointer to Vulkan command pool, created with vkCreateCommandPool VkFence* fence;//pointer to Vulkan fence, created with vkCreateFence uint64_t isCompilerInitialized;//specify if glslang compiler has been intialized before (0 - off, 1 - on). Default 0 #elif(VKFFT_BACKEND==1) CUdevice* device;//pointer to CUDA device, obtained from cuDeviceGet //CUcontext* context;//pointer to CUDA context, obtained from cuDeviceGet cudaStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels uint64_t num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. 
Default 1 #elif(VKFFT_BACKEND==2) hipDevice_t* device;//pointer to HIP device, obtained from hipDeviceGet //hipCtx_t* context;//pointer to HIP context, obtained from hipDeviceGet hipStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels uint64_t num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 1 #elif(VKFFT_BACKEND==3) cl_platform_id* platform;//not required cl_device_id* device; cl_context* context; #elif(VKFFT_BACKEND==4) ze_device_handle_t* device; ze_context_handle_t* context; ze_command_queue_handle_t* commandQueue; uint32_t commandQueueID; #elif(VKFFT_BACKEND==5) MTL::Device* device; MTL::CommandQueue* queue; #endif //data parameters: uint64_t userTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on) uint64_t bufferNum;//multiple buffer sequence storage is Vulkan only. Default 1 uint64_t tempBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation uint64_t inputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled uint64_t outputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled uint64_t kernelNum;//multiple buffer sequence storage is Vulkan only. Default 1, if performConvolution is enabled //sizes are obligatory in Vulkan backend, optional in others uint64_t* bufferSize;//array of buffers sizes in bytes uint64_t* tempBufferSize;//array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. 
Setting to non zero value enables manual user allocation uint64_t* inputBufferSize;//array of input buffers sizes in bytes, if isInputFormatted is enabled uint64_t* outputBufferSize;//array of output buffers sizes in bytes, if isOutputFormatted is enabled uint64_t* kernelSize;//array of kernel buffers sizes in bytes, if performConvolution is enabled #if(VKFFT_BACKEND==0) VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==1) void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==2) void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==3) cl_mem* buffer;//pointer to device buffer used for computations cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled cl_mem* outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==4) void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==5) MTL::Buffer** buffer;//pointer to device buffer used for computations MTL::Buffer** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation MTL::Buffer** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled MTL::Buffer** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled MTL::Buffer** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #endif uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0 uint64_t specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0 //optional: (default 0 if not stated otherwise) #if(VKFFT_BACKEND==0) VkPipelineCache* pipelineCache;//pointer to Vulkan pipeline cache #endif uint64_t coalescedMemory;//in bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Gonna work regardles, but if specified by user correctly, the performance will be higher. uint64_t aimThreads;//aim at this many threads per block. Default 128 uint64_t numSharedBanks;//how many banks shared memory has. Default 32 uint64_t inverseReturnToInputBuffer;//return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled uint64_t numberBatches;// N - used to perform multiple batches of initial data. 
Default 1 uint64_t useUint64;// use 64-bit addressing mode in generated kernels uint64_t omitDimension[3];//disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. Doesn't work for R2C dimension 0 for now. Doesn't work with convolutions. uint64_t performBandwidthBoost;//try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise uint64_t doublePrecision; //perform calculations in double precision (0 - off, 1 - on). uint64_t halfPrecision; //perform calculations in half precision (0 - off, 1 - on) uint64_t halfPrecisionMemoryOnly; //use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out of place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on) uint64_t doublePrecisionFloatMemory; //use FP64 precision for all calculations, while all memory storage is done in FP32. uint64_t performR2C; //perform R2C/C2R decomposition (0 - off, 1 - on) uint64_t performDCT; //perform DCT transformation (X - DCT type, 1-4) uint64_t disableMergeSequencesR2C; //disable merging of two real sequences to reduce calculations (0 - off, 1 - on) uint64_t normalize; //normalize inverse transform (0 - off, 1 - on) uint64_t disableReorderFourStep; // disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on) int64_t useLUT; //switches from calculating sincos to using precomputed LUT tables (-1 - off, 0 - auto, 1 - on). Configured by initialization routine int64_t useLUT_4step; //switches from calculating sincos to using precomputed LUT tables for intermediate roots of 1 in the Four-step FFT algorithm. (-1 - off, 0 - auto, 1 - on). 
Configured by initialization routine uint64_t makeForwardPlanOnly; //generate code only for forward FFT (0 - off, 1 - on) uint64_t makeInversePlanOnly; //generate code only for inverse FFT (0 - off, 1 - on) uint64_t bufferStride[3];//buffer strides - default set to x - x*y - x*y*z values uint64_t isInputFormatted; //specify if input buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) uint64_t isOutputFormatted; //specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) uint64_t inputBufferStride[3];//input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values uint64_t outputBufferStride[3];//output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values uint64_t considerAllAxesStrided;//will create plan for nonstrided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on) uint64_t keepShaderCode;//will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on) uint64_t printMemoryLayout;//will print order of buffers used in shaders (0 - off, 1 - on) uint64_t saveApplicationToString;//will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). Currently disabled in Metal backend. (0 - off, 1 - on) uint64_t loadApplicationFromString;//will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). Currently disabled in Metal backend. (0 - off, 1 - on). 
Mutually exclusive with saveApplicationToString void* loadApplicationString;//memory binary array through which user can load VkFFT binaries, must be provided by user if loadApplicationFromString = 1. Use rb/wb flags to load/save. uint64_t disableSetLocale;//disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0. //optional Bluestein optimizations: (default 0 if not stated otherwise) uint64_t fixMaxRadixBluestein;//controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13. uint64_t forceBluesteinSequenceSize;// force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13. uint64_t useCustomBluesteinPaddingPattern;// force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc. // paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern. uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern uint64_t fixMinRaderPrimeMult;//start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. 
Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader. uint64_t fixMaxRaderPrimeMult;//switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. Default is vendor-specific (currently ~40) uint64_t fixMinRaderPrimeFFT;//start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Deafult 29 on AMD and 17 on other GPUs. uint64_t fixMaxRaderPrimeFFT;//switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory. //optional zero padding control parameters: (default 0 if not stated otherwise) uint64_t performZeropadding[3]; // don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on) uint64_t fft_zeropad_left[3];//specify start boundary of zero block in the system for each axis uint64_t fft_zeropad_right[3];//specify end boundary of zero block in the system for each axis uint64_t frequencyZeroPadding; //set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding //optional convolution control parameters: (default 0 if not stated otherwise) uint64_t performConvolution; //perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter uint64_t conjugateConvolution;//0 off, 1 - conjugation of the sequence FFT is currently done on, 2 - conjugation of the convolution kernel uint64_t crossPowerSpectrumNormalization;//normalize the FFT x kernel multiplication in frequency domain uint64_t coordinateFeatures; // C - coordinate, or dimension of features vector. 
In matrix convolution - size of vector uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. Overrides coordinateFeatures uint64_t symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric uint64_t numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output uint64_t kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation //register overutilization (experimental): (default 0 if not stated otherwise) uint64_t registerBoost; //specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Default 1 uint64_t registerBoostNonPow2; //specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on) uint64_t registerBoost4Step; //specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1 //not used techniques: uint64_t swapTo3Stage4Step; //specify at which number to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 131072 uint64_t devicePageSize;//in KB, the size of a page on the GPU. 
Setting to 0 disables local buffer split in pages uint64_t localPageSize;//in KB, the size to split page into if sequence spans multiple devicePageSize pages //automatically filled based on device info (still can be reconfigured by user): uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device uint64_t maxComputeWorkGroupCount[3]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits uint64_t maxComputeWorkGroupSize[3]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits uint64_t maxThreadsNum; //max number of threads from VkPhysicalDeviceLimits uint64_t sharedMemorySizeStatic; //available for static allocation shared memory size, in bytes uint64_t sharedMemorySize; //available for allocation shared memory size, in bytes uint64_t sharedMemorySizePow2; //power of 2 which is less or equal to sharedMemorySize, in bytes uint64_t warpSize; //number of threads per warp/wavefront. uint64_t halfThreads;//Intel fix uint64_t allocateTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated uint64_t reorderFourStep; // unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1. int64_t maxCodeLength; //specify how big can be buffer used for code generation (in char). Default 4000000 chars. int64_t maxTempLength; //specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number. uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc. 
#if(VKFFT_BACKEND==0) VkDeviceMemory tempBufferDeviceMemory;//Filled at app creation VkCommandBuffer* commandBuffer;//Filled at app execution VkMemoryBarrier* memory_barrier;//Filled at app creation #elif(VKFFT_BACKEND==1) cudaEvent_t* stream_event;//Filled at app creation uint64_t streamCounter;//Filled at app creation uint64_t streamID;//Filled at app creation #elif(VKFFT_BACKEND==2) hipEvent_t* stream_event;//Filled at app creation uint64_t streamCounter;//Filled at app creation uint64_t streamID;//Filled at app creation int64_t useStrict32BitAddress; // guarantee 32 bit addresses in bytes instead of number of elements. This results in fewer instructions generated. -1: Disable, 0: Infer based on size, 1: enable. Has no effect with useUint64. #elif(VKFFT_BACKEND==3) cl_command_queue* commandQueue; #elif(VKFFT_BACKEND==4) ze_command_list_handle_t* commandList;//Filled at app execution #elif(VKFFT_BACKEND==5) MTL::CommandBuffer* commandBuffer;//Filled at app execution MTL::ComputeCommandEncoder* commandEncoder;//Filled at app execution #endif } VkFFTConfiguration;//parameters specified at plan creation typedef struct { #if(VKFFT_BACKEND==0) VkCommandBuffer* commandBuffer;//commandBuffer to which FFT is appended VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. 
Setting to non zero value enables manual user allocation VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==1) void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==2) void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==3) cl_command_queue* commandQueue;//commandBuffer to which FFT is appended cl_mem* buffer;//pointer to device buffer used for computations cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled cl_mem* outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==4) ze_command_list_handle_t* commandList;//commandList to which FFT is appended void** buffer;//pointer to device buffer used for computations void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled #elif(VKFFT_BACKEND==5) MTL::CommandBuffer* commandBuffer;//commandBuffer to which FFT is appended MTL::ComputeCommandEncoder* commandEncoder;//encoder associated with commandBuffer MTL::Buffer** buffer;//pointer to array of buffers (or one buffer) used for computations MTL::Buffer** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. 
Setting to non zero value enables manual user allocation MTL::Buffer** inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled MTL::Buffer** outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled MTL::Buffer** kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled #endif //following parameters can be specified during kernels launch, if specifyOffsetsAtLaunch parameter was enabled during the initializeVkFFT call uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. 
Default 0 } VkFFTLaunchParams;//parameters specified at plan execution typedef enum VkFFTResult { VKFFT_SUCCESS = 0, VKFFT_ERROR_MALLOC_FAILED = 1, VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER = 2, VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER = 3, VKFFT_ERROR_PLAN_NOT_INITIALIZED = 4, VKFFT_ERROR_NULL_TEMP_PASSED = 5, VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001, VKFFT_ERROR_INVALID_DEVICE = 1002, VKFFT_ERROR_INVALID_QUEUE = 1003, VKFFT_ERROR_INVALID_COMMAND_POOL = 1004, VKFFT_ERROR_INVALID_FENCE = 1005, VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED = 1006, VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED = 1007, VKFFT_ERROR_INVALID_CONTEXT = 1008, VKFFT_ERROR_INVALID_PLATFORM = 1009, VKFFT_ERROR_ENABLED_saveApplicationToString = 1010, VKFFT_ERROR_EMPTY_FILE = 1011, VKFFT_ERROR_EMPTY_FFTdim = 2001, VKFFT_ERROR_EMPTY_size = 2002, VKFFT_ERROR_EMPTY_bufferSize = 2003, VKFFT_ERROR_EMPTY_buffer = 2004, VKFFT_ERROR_EMPTY_tempBufferSize = 2005, VKFFT_ERROR_EMPTY_tempBuffer = 2006, VKFFT_ERROR_EMPTY_inputBufferSize = 2007, VKFFT_ERROR_EMPTY_inputBuffer = 2008, VKFFT_ERROR_EMPTY_outputBufferSize = 2009, VKFFT_ERROR_EMPTY_outputBuffer = 2010, VKFFT_ERROR_EMPTY_kernelSize = 2011, VKFFT_ERROR_EMPTY_kernel = 2012, VKFFT_ERROR_EMPTY_applicationString = 2013, VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays = 2014, VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002, VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004, VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005, VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001, VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002, VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS = 4003, VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER = 4004, VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER = 4005, VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE = 4006, VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES = 4007, VKFFT_ERROR_FAILED_TO_RESET_FENCES = 4008, VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL = 4009, VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT = 4010, 
VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS = 4011, VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT = 4012, VKFFT_ERROR_FAILED_SHADER_PREPROCESS = 4013, VKFFT_ERROR_FAILED_SHADER_PARSE = 4014, VKFFT_ERROR_FAILED_SHADER_LINK = 4015, VKFFT_ERROR_FAILED_SPIRV_GENERATE = 4016, VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE = 4017, VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE = 4018, VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER = 4019, VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE = 4020, VKFFT_ERROR_FAILED_TO_CREATE_DEVICE = 4021, VKFFT_ERROR_FAILED_TO_CREATE_FENCE = 4022, VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL = 4023, VKFFT_ERROR_FAILED_TO_CREATE_BUFFER = 4024, VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY = 4025, VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY = 4026, VKFFT_ERROR_FAILED_TO_FIND_MEMORY = 4027, VKFFT_ERROR_FAILED_TO_SYNCHRONIZE = 4028, VKFFT_ERROR_FAILED_TO_COPY = 4029, VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM = 4030, VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM = 4031, VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE = 4032, VKFFT_ERROR_FAILED_TO_GET_CODE = 4033, VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM = 4034, VKFFT_ERROR_FAILED_TO_LOAD_MODULE = 4035, VKFFT_ERROR_FAILED_TO_GET_FUNCTION = 4036, VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY = 4037, VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL = 4038, VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL = 4039, VKFFT_ERROR_FAILED_TO_EVENT_RECORD = 4040, VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION = 4041, VKFFT_ERROR_FAILED_TO_INITIALIZE = 4042, VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID = 4043, VKFFT_ERROR_FAILED_TO_GET_DEVICE = 4044, VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT = 4045, VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE = 4046, VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG = 4047, VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE = 4048, VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE = 4049, VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES = 4050, VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE = 4051, VKFFT_ERROR_FAILED_TO_CREATE_EVENT = 4052, VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST = 4053, 
VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST = 4054, VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER = 4055 } VkFFTResult; static inline const char* getVkFFTErrorString(VkFFTResult result) { switch (result) { case VKFFT_SUCCESS: return "VKFFT_SUCCESS"; case VKFFT_ERROR_MALLOC_FAILED: return "VKFFT_ERROR_MALLOC_FAILED"; case VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER: return "VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER"; case VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER: return "VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER"; case VKFFT_ERROR_PLAN_NOT_INITIALIZED: return "VKFFT_ERROR_PLAN_NOT_INITIALIZED"; case VKFFT_ERROR_NULL_TEMP_PASSED: return "VKFFT_ERROR_NULL_TEMP_PASSED"; case VKFFT_ERROR_INVALID_PHYSICAL_DEVICE: return "VKFFT_ERROR_INVALID_PHYSICAL_DEVICE"; case VKFFT_ERROR_INVALID_DEVICE: return "VKFFT_ERROR_INVALID_DEVICE"; case VKFFT_ERROR_INVALID_QUEUE: return "VKFFT_ERROR_INVALID_QUEUE"; case VKFFT_ERROR_INVALID_COMMAND_POOL: return "VKFFT_ERROR_INVALID_COMMAND_POOL"; case VKFFT_ERROR_INVALID_FENCE: return "VKFFT_ERROR_INVALID_FENCE"; case VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED: return "VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED"; case VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED: return "VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED"; case VKFFT_ERROR_INVALID_CONTEXT: return "VKFFT_ERROR_INVALID_CONTEXT"; case VKFFT_ERROR_INVALID_PLATFORM: return "VKFFT_ERROR_INVALID_PLATFORM"; case VKFFT_ERROR_ENABLED_saveApplicationToString: return "VKFFT_ERROR_ENABLED_saveApplicationToString"; case VKFFT_ERROR_EMPTY_FILE: return "VKFFT_ERROR_EMPTY_FILE"; case VKFFT_ERROR_EMPTY_FFTdim: return "VKFFT_ERROR_EMPTY_FFTdim"; case VKFFT_ERROR_EMPTY_size: return "VKFFT_ERROR_EMPTY_size"; case VKFFT_ERROR_EMPTY_bufferSize: return "VKFFT_ERROR_EMPTY_bufferSize"; case VKFFT_ERROR_EMPTY_buffer: return "VKFFT_ERROR_EMPTY_buffer"; case VKFFT_ERROR_EMPTY_tempBufferSize: return "VKFFT_ERROR_EMPTY_tempBufferSize"; case VKFFT_ERROR_EMPTY_tempBuffer: return "VKFFT_ERROR_EMPTY_tempBuffer"; case VKFFT_ERROR_EMPTY_inputBufferSize: return 
"VKFFT_ERROR_EMPTY_inputBufferSize"; case VKFFT_ERROR_EMPTY_inputBuffer: return "VKFFT_ERROR_EMPTY_inputBuffer"; case VKFFT_ERROR_EMPTY_outputBufferSize: return "VKFFT_ERROR_EMPTY_outputBufferSize"; case VKFFT_ERROR_EMPTY_outputBuffer: return "VKFFT_ERROR_EMPTY_outputBuffer"; case VKFFT_ERROR_EMPTY_kernelSize: return "VKFFT_ERROR_EMPTY_kernelSize"; case VKFFT_ERROR_EMPTY_kernel: return "VKFFT_ERROR_EMPTY_kernel"; case VKFFT_ERROR_EMPTY_applicationString: return "VKFFT_ERROR_EMPTY_applicationString"; case VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays: return "VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays"; case VKFFT_ERROR_UNSUPPORTED_RADIX: return "VKFFT_ERROR_UNSUPPORTED_RADIX"; case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH: return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH"; case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C: return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C"; case VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT: return "VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT"; case VKFFT_ERROR_UNSUPPORTED_FFT_OMIT: return "VKFFT_ERROR_UNSUPPORTED_FFT_OMIT"; case VKFFT_ERROR_FAILED_TO_ALLOCATE: return "VKFFT_ERROR_FAILED_TO_ALLOCATE"; case VKFFT_ERROR_FAILED_TO_MAP_MEMORY: return "VKFFT_ERROR_FAILED_TO_MAP_MEMORY"; case VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS: return "VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS"; case VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER: return "VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER"; case VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER: return "VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER"; case VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE: return "VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE"; case VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES: return "VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES"; case VKFFT_ERROR_FAILED_TO_RESET_FENCES: return "VKFFT_ERROR_FAILED_TO_RESET_FENCES"; case VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL: return "VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL"; case VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT: return 
"VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT"; case VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS: return "VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS"; case VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT: return "VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT"; case VKFFT_ERROR_FAILED_SHADER_PREPROCESS: return "VKFFT_ERROR_FAILED_SHADER_PREPROCESS"; case VKFFT_ERROR_FAILED_SHADER_PARSE: return "VKFFT_ERROR_FAILED_SHADER_PARSE"; case VKFFT_ERROR_FAILED_SHADER_LINK: return "VKFFT_ERROR_FAILED_SHADER_LINK"; case VKFFT_ERROR_FAILED_SPIRV_GENERATE: return "VKFFT_ERROR_FAILED_SPIRV_GENERATE"; case VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE: return "VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE"; case VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE: return "VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE"; case VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER: return "VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER"; case VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE: return "VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE"; case VKFFT_ERROR_FAILED_TO_CREATE_DEVICE: return "VKFFT_ERROR_FAILED_TO_CREATE_DEVICE"; case VKFFT_ERROR_FAILED_TO_CREATE_FENCE: return "VKFFT_ERROR_FAILED_TO_CREATE_FENCE"; case VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL: return "VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL"; case VKFFT_ERROR_FAILED_TO_CREATE_BUFFER: return "VKFFT_ERROR_FAILED_TO_CREATE_BUFFER"; case VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY: return "VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY"; case VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY: return "VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY"; case VKFFT_ERROR_FAILED_TO_FIND_MEMORY: return "VKFFT_ERROR_FAILED_TO_FIND_MEMORY"; case VKFFT_ERROR_FAILED_TO_SYNCHRONIZE: return "VKFFT_ERROR_FAILED_TO_SYNCHRONIZE"; case VKFFT_ERROR_FAILED_TO_COPY: return "VKFFT_ERROR_FAILED_TO_COPY"; case VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM: return "VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM"; case VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM: return "VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM"; case 
VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE: return "VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE"; case VKFFT_ERROR_FAILED_TO_GET_CODE: return "VKFFT_ERROR_FAILED_TO_GET_CODE"; case VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM: return "VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM"; case VKFFT_ERROR_FAILED_TO_LOAD_MODULE: return "VKFFT_ERROR_FAILED_TO_LOAD_MODULE"; case VKFFT_ERROR_FAILED_TO_GET_FUNCTION: return "VKFFT_ERROR_FAILED_TO_GET_FUNCTION"; case VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY: return "VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY"; case VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL: return "VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL"; case VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL: return "VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL"; case VKFFT_ERROR_FAILED_TO_EVENT_RECORD: return "VKFFT_ERROR_FAILED_TO_EVENT_RECORD"; case VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION: return "VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION"; case VKFFT_ERROR_FAILED_TO_INITIALIZE: return "VKFFT_ERROR_FAILED_TO_INITIALIZE"; case VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID: return "VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID"; case VKFFT_ERROR_FAILED_TO_GET_DEVICE: return "VKFFT_ERROR_FAILED_TO_GET_DEVICE"; case VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT: return "VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT"; case VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE: return "VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE"; case VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG: return "VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG"; case VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE: return "VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE"; case VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE: return "VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE"; case VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES: return "VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES"; case VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE: return "VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE"; case VKFFT_ERROR_FAILED_TO_CREATE_EVENT: return "VKFFT_ERROR_FAILED_TO_CREATE_EVENT"; case VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST: return 
"VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST"; case VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST: return "VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST"; case VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER: return "VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER"; } return "Unknown VkFFT error"; } typedef struct VkFFTRaderContainer VkFFTRaderContainer; struct VkFFTRaderContainer { uint64_t prime; uint64_t generator; uint64_t multiplier; uint64_t inline_rader_g_pow; uint64_t raderUintLUToffset; uint64_t type; //0 - FFT, 1 - Direct multiplication uint64_t raderRegisters; uint64_t rader_min_registers; //Direct multiplication parameters //FFT parameters uint64_t registers_per_thread; uint64_t min_registers_per_thread; uint64_t loc_multipliers[33]; uint64_t registers_per_thread_per_radix[33]; uint64_t stageRadix[20]; uint64_t numStages; uint64_t numSubPrimes; uint64_t stage_rader_generator[20]; uint64_t containerFFTDim; uint64_t containerFFTNum; uint64_t subLogicalGroupSizeMax;//how many threads are needed per Rader transform uint64_t RaderKernelOffsetLUT; uint64_t RaderRadixOffsetLUT; uint64_t RaderRadixOffsetLUTiFFT; void* raderFFTkernel; struct VkFFTRaderContainer* container; }; typedef struct { uint64_t size[3]; uint64_t localSize[3]; uint64_t numSubgroups; uint64_t sourceFFTSize; uint64_t fftDim; uint64_t precision; uint64_t inverse; uint64_t actualInverse; uint64_t inverseBluestein; uint64_t zeropad[2]; uint64_t zeropadBluestein[2]; uint64_t axis_id; uint64_t axis_upload_id; uint64_t numAxisUploads; uint64_t registers_per_thread; uint64_t registers_per_thread_per_radix[33]; uint64_t min_registers_per_thread; uint64_t maxNonPow2Radix; uint64_t usedLocRegs; uint64_t readToRegisters; uint64_t writeFromRegisters; uint64_t LUT; uint64_t LUT_4step; uint64_t raderUintLUT; uint64_t useCoalescedLUTUploadToSM; uint64_t useBluesteinFFT; uint64_t reverseBluesteinMultiUpload; uint64_t BluesteinConvolutionStep; uint64_t BluesteinPreMultiplication; uint64_t BluesteinPostMultiplication; uint64_t 
startDCT3LUT; uint64_t startDCT4LUT; uint64_t performR2C; uint64_t performR2CmultiUpload; uint64_t performDCT; uint64_t performBandwidthBoost; uint64_t frequencyZeropadding; uint64_t performZeropaddingFull[3]; // don't do read/write if full sequence is omitted uint64_t performZeropaddingInput[3]; // don't read if input is zeropadded (0 - off, 1 - on) uint64_t performZeropaddingOutput[3]; // don't write if output is zeropadded (0 - off, 1 - on) uint64_t fft_zeropad_left_full[3]; uint64_t fft_zeropad_left_read[3]; uint64_t fft_zeropad_left_write[3]; uint64_t fft_zeropad_right_full[3]; uint64_t fft_zeropad_right_read[3]; uint64_t fft_zeropad_right_write[3]; uint64_t fft_zeropad_Bluestein_left_read[3]; uint64_t fft_zeropad_Bluestein_left_write[3]; uint64_t fft_zeropad_Bluestein_right_read[3]; uint64_t fft_zeropad_Bluestein_right_write[3]; uint64_t inputStride[5]; uint64_t outputStride[5]; uint64_t fft_dim_full; uint64_t stageStartSize; uint64_t firstStageStartSize; uint64_t fft_dim_x; uint64_t dispatchZactualFFTSize; uint64_t numStages; uint64_t stageRadix[33]; uint64_t inputOffset; uint64_t kernelOffset; uint64_t outputOffset; uint64_t reorderFourStep; uint64_t pushConstantsStructSize; uint64_t performWorkGroupShift[3]; uint64_t performPostCompilationInputOffset; uint64_t performPostCompilationOutputOffset; uint64_t performPostCompilationKernelOffset; uint64_t inputBufferBlockNum; uint64_t inputBufferBlockSize; uint64_t outputBufferBlockNum; uint64_t outputBufferBlockSize; uint64_t kernelBlockNum; uint64_t kernelBlockSize; uint64_t numCoordinates; uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. 
Overrides coordinateFeatures uint64_t numBatches; uint64_t numKernels; uint64_t conjugateConvolution; uint64_t crossPowerSpectrumNormalization; uint64_t usedSharedMemory; uint64_t sharedMemSize; uint64_t sharedMemSizePow2; uint64_t normalize; uint64_t complexSize; uint64_t inputNumberByteSize; uint64_t outputNumberByteSize; uint64_t kernelNumberByteSize; uint64_t maxStageSumLUT; uint64_t unroll; uint64_t swapComputeWorkGroupID; uint64_t convolutionStep; uint64_t symmetricKernel; uint64_t supportAxis; uint64_t cacheShuffle; uint64_t registerBoost; uint64_t warpSize; uint64_t numSharedBanks; uint64_t resolveBankConflictFirstStages; uint64_t sharedStrideBankConflictFirstStages; uint64_t sharedStrideReadWriteConflict; uint64_t sharedStrideRaderFFT; uint64_t sharedShiftRaderFFT; uint64_t maxSharedStride; uint64_t axisSwapped; uint64_t mergeSequencesR2C; uint64_t numBuffersBound[10]; uint64_t convolutionBindingID; uint64_t LUTBindingID; uint64_t BluesteinConvolutionBindingID; uint64_t BluesteinMultiplicationBindingID; uint64_t useRader; uint64_t numRaderPrimes; uint64_t minRaderFFTThreadNum; VkFFTRaderContainer* raderContainer; VkFFTRaderContainer* currentRaderContainer; uint64_t RaderUintLUTBindingID; uint64_t useRaderMult; uint64_t additionalRaderSharedSize; uint64_t RaderKernelOffsetShared[33]; uint64_t RaderKernelOffsetLUT[33]; uint64_t rader_generator[33]; uint64_t fixMinRaderPrimeMult;//start Rader algorithm for primes from this number uint64_t fixMaxRaderPrimeMult;//switch from Rader to Bluestein algorithm for primes from this number uint64_t fixMinRaderPrimeFFT;//start Rader algorithm for primes from this number uint64_t fixMaxRaderPrimeFFT;//switch from Rader to Bluestein algorithm for primes from this number uint64_t inline_rader_g_pow; uint64_t inline_rader_kernel; uint64_t raderRegisters; uint64_t rader_min_registers; uint64_t useRaderFFT; uint64_t performOffsetUpdate; uint64_t performBufferSetUpdate; uint64_t useUint64; #if(VKFFT_BACKEND==2) int64_t 
useStrict32BitAddress; #endif uint64_t disableSetLocale; char** regIDs; char* disableThreadsStart; char* disableThreadsEnd; char sdataID[50]; char inoutID[50]; char combinedID[50]; char raderIDx[50]; char raderIDx2[50]; char gl_LocalInvocationID_x[50]; char gl_LocalInvocationID_y[50]; char gl_LocalInvocationID_z[50]; char gl_GlobalInvocationID_x[200]; char gl_GlobalInvocationID_y[200]; char gl_GlobalInvocationID_z[200]; char gl_SubgroupInvocationID[200]; char gl_SubgroupID[200]; char tshuffle[50]; char sharedStride[50]; char gl_WorkGroupSize_x[50]; char gl_WorkGroupSize_y[50]; char gl_WorkGroupSize_z[50]; char gl_WorkGroupID_x[50]; char gl_WorkGroupID_y[50]; char gl_WorkGroupID_z[50]; char tempReg[50]; char vecType[30]; char stageInvocationID[50]; char blockInvocationID[50]; char temp[50]; char temp2[50]; char w[50]; char iw[50]; char x0[33][40]; char locID[33][40]; char* code0; char* output; char* tempStr; int64_t tempLen; int64_t currentLen; int64_t maxCodeLength; int64_t maxTempLength; char oldLocale[100]; } VkFFTSpecializationConstantsLayout; typedef struct { uint32_t dataUint32[10]; uint64_t dataUint64[10]; #if(VKFFT_BACKEND == 5) MTL::Buffer* dataUintBuffer; #endif //specify what can be in layout uint64_t performWorkGroupShift[3]; uint64_t workGroupShift[3]; uint64_t performPostCompilationInputOffset; uint64_t inputOffset; uint64_t performPostCompilationOutputOffset; uint64_t outputOffset; uint64_t performPostCompilationKernelOffset; uint64_t kernelOffset; uint64_t structSize; } VkFFTPushConstantsLayout; typedef struct { uint64_t numBindings; uint64_t axisBlock[4]; uint64_t groupedBatch; VkFFTSpecializationConstantsLayout specializationConstants; VkFFTPushConstantsLayout pushConstants; uint64_t updatePushConstants; #if(VKFFT_BACKEND==0) VkBuffer* inputBuffer; VkBuffer* outputBuffer; VkDescriptorPool descriptorPool; VkDescriptorSetLayout descriptorSetLayout; VkDescriptorSet descriptorSet; VkPipelineLayout pipelineLayout; VkPipeline pipeline; VkDeviceMemory 
bufferLUTDeviceMemory; VkBuffer bufferLUT; VkDeviceMemory bufferRaderUintLUTDeviceMemory; VkBuffer bufferRaderUintLUT; VkDeviceMemory* bufferBluesteinDeviceMemory; VkDeviceMemory* bufferBluesteinFFTDeviceMemory; VkBuffer* bufferBluestein; VkBuffer* bufferBluesteinFFT; #elif(VKFFT_BACKEND==1) void** inputBuffer; void** outputBuffer; CUmodule VkFFTModule; CUfunction VkFFTKernel; void* bufferLUT; void* bufferRaderUintLUT; CUdeviceptr consts_addr; void** bufferBluestein; void** bufferBluesteinFFT; #elif(VKFFT_BACKEND==2) void** inputBuffer; void** outputBuffer; hipModule_t VkFFTModule; hipFunction_t VkFFTKernel; void* bufferLUT; void* bufferRaderUintLUT; hipDeviceptr_t consts_addr; void** bufferBluestein; void** bufferBluesteinFFT; #elif(VKFFT_BACKEND==3) cl_mem* inputBuffer; cl_mem* outputBuffer; cl_program program; cl_kernel kernel; cl_mem bufferLUT; cl_mem bufferRaderUintLUT; cl_mem* bufferBluestein; cl_mem* bufferBluesteinFFT; #elif(VKFFT_BACKEND==4) void** inputBuffer; void** outputBuffer; ze_module_handle_t VkFFTModule; ze_kernel_handle_t VkFFTKernel; void* bufferLUT; void* bufferRaderUintLUT; void** bufferBluestein; void** bufferBluesteinFFT; #elif(VKFFT_BACKEND==5) MTL::Buffer** inputBuffer; MTL::Buffer** outputBuffer; MTL::Library* library; MTL::ComputePipelineState* pipeline; MTL::Buffer* bufferLUT; MTL::Buffer* bufferRaderUintLUT; MTL::Buffer** bufferBluestein; MTL::Buffer** bufferBluesteinFFT; #endif void* binary; uint64_t binarySize; uint64_t bufferLUTSize; uint64_t bufferRaderUintLUTSize; uint64_t referenceLUT; } VkFFTAxis; typedef struct { uint64_t actualFFTSizePerAxis[3][3]; uint64_t numAxisUploads[3]; uint64_t axisSplit[3][4]; VkFFTAxis axes[3][4]; uint64_t multiUploadR2C; uint64_t actualPerformR2CPerAxis[3]; // automatically specified, shows if R2C is actually performed or inside FFT or as a separate step VkFFTAxis R2Cdecomposition; VkFFTAxis inverseBluesteinAxes[3][4]; } VkFFTPlan; typedef struct { VkFFTConfiguration configuration; VkFFTPlan* 
localFFTPlan; VkFFTPlan* localFFTPlan_inverse; //additional inverse plan uint64_t actualNumBatches; uint64_t firstAxis; uint64_t lastAxis; //Bluestein buffers reused among plans uint64_t useBluesteinFFT[3]; #if(VKFFT_BACKEND==0) VkDeviceMemory bufferRaderUintLUTDeviceMemory[3][4]; VkBuffer bufferRaderUintLUT[3][4]; VkDeviceMemory bufferBluesteinDeviceMemory[3]; VkDeviceMemory bufferBluesteinFFTDeviceMemory[3]; VkDeviceMemory bufferBluesteinIFFTDeviceMemory[3]; VkBuffer bufferBluestein[3]; VkBuffer bufferBluesteinFFT[3]; VkBuffer bufferBluesteinIFFT[3]; #elif(VKFFT_BACKEND==1) void* bufferRaderUintLUT[3][4]; void* bufferBluestein[3]; void* bufferBluesteinFFT[3]; void* bufferBluesteinIFFT[3]; #elif(VKFFT_BACKEND==2) void* bufferRaderUintLUT[3][4]; void* bufferBluestein[3]; void* bufferBluesteinFFT[3]; void* bufferBluesteinIFFT[3]; #elif(VKFFT_BACKEND==3) cl_mem bufferRaderUintLUT[3][4]; cl_mem bufferBluestein[3]; cl_mem bufferBluesteinFFT[3]; cl_mem bufferBluesteinIFFT[3]; #elif(VKFFT_BACKEND==4) void* bufferRaderUintLUT[3][4]; void* bufferBluestein[3]; void* bufferBluesteinFFT[3]; void* bufferBluesteinIFFT[3]; #elif(VKFFT_BACKEND==5) MTL::Buffer* bufferRaderUintLUT[3][4]; MTL::Buffer* bufferBluestein[3]; MTL::Buffer* bufferBluesteinFFT[3]; MTL::Buffer* bufferBluesteinIFFT[3]; #endif uint64_t bufferRaderUintLUTSize[3][4]; uint64_t bufferBluesteinSize[3]; void* applicationBluesteinString[3]; uint64_t applicationBluesteinStringSize[3]; uint64_t numRaderFFTPrimes; uint64_t rader_primes[30]; uint64_t rader_buffer_size[30]; void* raderFFTkernel[30]; uint64_t applicationStringOffsetRader; uint64_t currentApplicationStringPos; uint64_t applicationStringSize;//size of saveApplicationString in bytes void* saveApplicationString;//memory array(uint32_t* for Vulkan, char* for CUDA/HIP/OpenCL) through which user can access VkFFT generated binaries. 
(will be allocated by VkFFT, deallocated with deleteVkFFT call) } VkFFTApplication; static inline VkFFTResult VkAppendLine(VkFFTSpecializationConstantsLayout* sc) { //appends code line stored in tempStr to generated code if (sc->tempLen < 0) return VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER; if (sc->currentLen + sc->tempLen > sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER; sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", sc->tempStr); return VKFFT_SUCCESS; } static inline VkFFTResult VkAppendLineFromInput(VkFFTSpecializationConstantsLayout* sc, const char* in) { //appends code line stored in tempStr to generated code if (sc->currentLen + (int64_t)strlen(in) > sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER; sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", in); return VKFFT_SUCCESS; } static inline VkFFTResult appendLicense(VkFFTSpecializationConstantsLayout* sc) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ // This file is part of VkFFT, a Vulkan Fast Fourier Transform library\n\ //\n\ // Copyright (C) 2020 - present Dmitrii Tolmachev \n\ //\n\ // Permission is hereby granted, free of charge, to any person obtaining a copy\n\ // of this software and associated documentation files (the \"Software\"), to deal\n\ // in the Software without restriction, including without limitation the rights\n\ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n\ // copies of the Software, and to permit persons to whom the Software is\n\ // furnished to do so, subject to the following conditions:\n\ //\n\ // The above copyright notice and this permission notice shall be included in\n\ // all copies or substantial portions of the Software.\n\ //\n\ // THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n\ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n\ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n\ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n\ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n\ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n\ // THE SOFTWARE.\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", out, in); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", out, in); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout* sc, const char* id, const char* in) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", id, in); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* id) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", out, id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkAddComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; 
#if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s;\n", out, in_1, in_2); #else sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x + %s.x;\n\ %s.y = %s.y + %s.y;\n", out, in_1, in_2, out, in_1, in_2); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s = - %s - %s;\n", out, in_1, in_2); #else sc->tempLen = sprintf(sc->tempStr, "\ %s.x = - %s.x - %s.x;\n\ %s.y = - %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkAddComplex_x(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x + %s.x;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkAddComplex_y(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y + %s.y;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s = %s - %s;\n", out, in_1, in_2); #else sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x - %s.x;\n\ %s.y = %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSubComplex_x(VkFFTSpecializationConstantsLayout* sc, const 
char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x - %s.x;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSubComplex_y(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y - %s.y;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s - %s;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkFMA3Complex(VkFFTSpecializationConstantsLayout* sc, const char* out_1, const char* out_2, const char* in_1, const char* in_num, const char* in_conj, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, in_1, in_1, in_conj, in_conj); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; /*#if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x;\n\ %s.y = %s.y;\n", temp, in_1, temp, in_conj); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s.x + %s;\n", out_1, temp, in_num, out_1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.y;\n\ %s.y = %s.x;\n", temp, in_1, temp, in_conj); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s.y + %s;\n", out_2, temp, in_num, out_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
#else*/ sc->tempLen = sprintf(sc->tempStr, "\ %s.x = fma(%s.x, %s.x, %s.x);\n\ %s.y = fma(%s.y, %s.x, %s.y);\n", out_1, in_1, in_num, out_1, out_1, in_conj, in_num, out_1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = fma(%s.y, %s.y, %s.x);\n\ %s.y = fma(%s.x, %s.y, %s.y);\n", out_2, in_1, in_num, out_2, out_2, in_conj, in_num, out_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //#endif /*sc->tempLen = sprintf(sc->tempStr, "\ temp2.x = fma(%s.x, %s.x, %s.x);\n\ %s.x = temp2.x;\n\ temp2.y = fma(%s.y, %s.x, %s.y);\n\ %s.y = temp2.y;\n", in_1, in_num, out_1, out_1, in_conj, in_num, out_1, out_1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ temp2.x = fma(%s.y, %s.y, %s.x);\n\ %s.x = temp2.x;\n\ temp2.y = fma(%s.x, %s.y, %s.y);\n\ %s.y = temp2.y;\n", in_1, in_num, out_2, out_2, in_conj, in_num, out_2, out_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, out_1, out_1, out_2, out_2); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkFMA3Complex_const_w(VkFFTSpecializationConstantsLayout* sc, const char* out_1, const char* out_2, const char* in_1, const char* in_num_x, const char* in_num_y, const char* in_conj, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, in_1, in_1, in_conj, in_conj); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x;\n\ %s.y = %s.y;\n", temp, in_1, temp, in_conj); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, "\ %s = %s * %s + %s;\n", out_1, temp, in_num_x, out_1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.y;\n\ %s.y = %s.x;\n", temp, in_1, temp, in_conj); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s + %s;\n", out_2, temp, in_num_y, out_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { #endif sc->tempLen = sprintf(sc->tempStr, "\ %s.x = fma(%s.x, %s, %s.x);\n\ %s.y = fma(%s.y, %s, %s.y);\n", out_1, in_1, in_num_x, out_1, out_1, in_conj, in_num_x, out_1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = fma(%s.y, %s, %s.x);\n\ %s.y = fma(%s.x, %s, %s.y);\n", out_2, in_1, in_num_y, out_2, out_2, in_conj, in_num_y, out_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #if(VKFFT_BACKEND==2) } #endif return res; } static inline VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s + %s;\n", out, in_1, in_num, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, "\ %s.x = fma(%s.x, %s, %s.x);\n\ %s.y = fma(%s.y, %s, %s.y);\n", out, in_1, in_num, in_2, out, in_1, in_num, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = fma(%s, %s, %s);\n", out, in_1, in_num, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult 
VkMulComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s.x + %s(-%s.y, %s.x) * %s.y;\n", out, in_1, in_2, sc->vecType, in_1, in_1, in_2); } else { #endif if (strcmp(out, in_1) && strcmp(out, in_2)) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x * %s.x - %s.y * %s.y;\n\ %s.y = %s.y * %s.x + %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2); } else { if (temp) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x * %s.x - %s.y * %s.y;\n\ %s.y = %s.y * %s.x + %s.x * %s.y;\n\ %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp); } else return VKFFT_ERROR_NULL_TEMP_PASSED; } #if(VKFFT_BACKEND==2) } #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s.x + %s(%s.y, -%s.x) * %s.y;\n", out, in_1, in_2, sc->vecType, in_1, in_1, in_2); } else { #endif if (strcmp(out, in_1) && strcmp(out, in_2)) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x * %s.x + %s.y * %s.y;\n\ %s.y = %s.y * %s.x - %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2); } else { if (temp) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x * %s.x + %s.y * %s.y;\n\ %s.y = %s.y * %s.x - %s.x * %s.y;\n\ %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp); } else return VKFFT_ERROR_NULL_TEMP_PASSED; } #if(VKFFT_BACKEND==2) } #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout* 
sc, const char* out, const char* in_1, const char* in_num) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s;\n", out, in_1, in_num); #else sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x * %s;\n\ %s.y = %s.y * %s;\n", out, in_1, in_num, out, in_1, in_num); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s(-%s.y, %s.x) * %s;\n", out, sc->vecType, in_1, in_1, in_num); } else { #endif if (strcmp(out, in_1)) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = - %s.y * %s;\n\ %s.y = %s.x * %s;\n", out, in_1, in_num, out, in_1, in_num); } else { if (temp) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = - %s.y * %s;\n\ %s.y = %s.x * %s;\n\ %s = %s;\n", temp, in_1, in_num, temp, in_1, in_num, out, temp); } else return VKFFT_ERROR_NULL_TEMP_PASSED; } #if(VKFFT_BACKEND==2) } #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x / %s;\n\ %s.y = %s.y / %s;\n", out, in_1, in_num, out, in_1, in_num); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %s;\n", out, in_1, in_2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult 
VkShuffleComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s(-%s.y, %s.x);\n", out, in_1, sc->vecType, in_2, in_2); } else { #endif if (strcmp(out, in_2)) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x - %s.y;\n\ %s.y = %s.y + %s.x;\n", out, in_1, in_2, out, in_1, in_2); } else { if (temp) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x - %s.y;\n\ %s.y = %s.x + %s.y;\n\ %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp); } else return VKFFT_ERROR_NULL_TEMP_PASSED; } #if(VKFFT_BACKEND==2) } #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==2) if (sc->precision == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s(%s.y, -%s.x);\n", out, in_1, sc->vecType, in_2, in_2); } else { #endif if (strcmp(out, in_2)) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x + %s.y;\n\ %s.y = %s.y - %s.x;\n", out, in_1, in_2, out, in_1, in_2); } else { if (temp) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x + %s.y;\n\ %s.y = %s.x - %s.y;\n\ %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp); } else return VKFFT_ERROR_NULL_TEMP_PASSED; } #if(VKFFT_BACKEND==2) } #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkModReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %s;\n", out, in_1, in_num); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult 
VkDivReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %s;\n", out, in_1, in_num); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout* sc, const uint64_t* permute, const uint64_t num_elem, const uint64_t type, char** regIDs, const char* temp) { VkFFTResult res = VKFFT_SUCCESS; char temp_ID[33][20]; /*uint64_t permute_complete[33]; uint64_t num_completed = 0; uint64_t start = 0; uint64_t start_subcycle = 0;*/ if (type == 0) { for (uint64_t i = 0; i < num_elem; i++) sprintf(temp_ID[i], "%s", sc->locID[i]); for (uint64_t i = 0; i < num_elem; i++) sprintf(sc->locID[i], "%s", temp_ID[permute[i]]); /*for (uint64_t i = 0; i < num_elem; i++) { permute_complete[i] = 0; } while (start != num_elem) { if (permute_complete[start] == 0) { if (start_subcycle == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", temp, sc->locID[start]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; start_subcycle = start; } if (permute[start] == start_subcycle) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->locID[start], temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->locID[start], sc->locID[permute[start]]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } permute_complete[start] = 1; start = permute[start]; } else { start++; start_subcycle = 0; } }*/ } if (type == 1) { for (uint64_t i = 0; i < num_elem; i++) sprintf(temp_ID[i], "%s", regIDs[i]); for (uint64_t i = 0; i < num_elem; i++) sprintf(regIDs[i], "%s", temp_ID[permute[i]]); /*for (uint64_t i = 0; i < num_elem; i++) { permute_complete[i] = 0; } while (start != num_elem) { if (permute_complete[start] == 0) { if (start_subcycle == 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", 
temp, regIDs[start]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; start_subcycle = start; } if (permute[start] == start_subcycle) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", regIDs[start], temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", regIDs[start], regIDs[permute[start]]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } permute_complete[start] = 1; start = permute[start]; } else { start++; start_subcycle = 0; } }*/ } return res; } static inline VkFFTResult VkSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, const char* in, const char* out, const uint64_t subWarpSplit) { VkFFTResult res = VKFFT_SUCCESS; #if (VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, " %s.x = subgroupAdd(%s.x);\n", out, in); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = subgroupAdd(%s.y);\n", out, in); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif (VKFFT_BACKEND==1) //v1 /*for (int i = 1; i < sc->warpSize / subWarpSplit; i *= 2) { sc->tempLen = sprintf(sc->tempStr, " %s.x += __shfl_xor_sync(0xffffffff, %s.x, %d);\n", out, in, i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y += __shfl_xor_sync(0xffffffff, %s.y, %d);\n", out, in, i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //v2 for (int i = (int)sc->warpSize / 2 / subWarpSplit; i > 0; i /= 2) { sc->tempLen = sprintf(sc->tempStr, " %s.x += __shfl_down_sync(0xffffffff, %s.x, %d);\n", out, in, i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y += __shfl_down_sync(0xffffffff, %s.y, %d);\n", out, in, i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ #endif return res; } static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration); 
static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams); static inline VkFFTResult appendVersion(VkFFTSpecializationConstantsLayout* sc) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "#version 450\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) //sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_debug_printf : require\n\n"); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if ((!strcmp(floatType, "double")) || (sc->useUint64)) { sc->tempLen = sprintf(sc->tempStr, "\ #extension GL_ARB_gpu_shader_fp64 : enable\n\ #extension GL_ARB_gpu_shader_int64 : enable\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((!strcmp(floatTypeInputMemory, "half")) || (!strcmp(floatTypeOutputMemory, "half")) || (!strcmp(floatTypeKernelMemory, "half"))) { sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_shader_16bit_storage : require\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #elif(VKFFT_BACKEND==1) #elif(VKFFT_BACKEND==2) #ifdef VKFFT_OLD_ROCM sc->tempLen = sprintf(sc->tempStr, "\ #include \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if ((!strcmp(floatType, "double")) || (sc->useUint64)) { sc->tempLen = sprintf(sc->tempStr, "\ #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "\ #include \n\ using namespace metal;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline 
VkFFTResult appendLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "layout (local_size_x = %" PRIu64 ", local_size_y = %" PRIu64 ", local_size_z = %" PRIu64 ") in;\n", sc->localSize[0], sc->localSize[1], sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==1) #elif(VKFFT_BACKEND==2) #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) #endif return res; } static inline VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name, const char* defaultVal, const char* LFending) { VkFFTResult res = VKFFT_SUCCESS; #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__constant %s %s = %s%s;\n", type, name, defaultVal, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "constant %s %s = %s%s;\n", type, name, defaultVal, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, "const %s %s = %s%s;\n", type, name, defaultVal, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name) { VkFFTResult res = VKFFT_SUCCESS; sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", type, name); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult appendBarrierVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t numTab) { VkFFTResult res = VKFFT_SUCCESS; char tabs[100]; for (uint64_t i = 0; i < numTab; i++) sprintf(tabs, " "); #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "%sbarrier();\n\n", tabs); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, 
"%s__syncthreads();\n\n", tabs); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "%sbarrier(CLK_LOCAL_MEM_FENCE);\n\n", tabs); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "%sthreadgroup_barrier(mem_flags::mem_none);\n\n", tabs); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) { VkFFTResult res = VKFFT_SUCCESS; if (sc->pushConstantsStructSize == 0) return res; #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif if (sc->performWorkGroupShift[0]) { res = appendPushConstant(sc, uintType, "workGroupShiftX"); if (res != VKFFT_SUCCESS) return res; } if (sc->performWorkGroupShift[1]) { res = appendPushConstant(sc, uintType, "workGroupShiftY"); if (res != VKFFT_SUCCESS) return res; } if (sc->performWorkGroupShift[2]) { res = appendPushConstant(sc, uintType, 
"workGroupShiftZ"); if (res != VKFFT_SUCCESS) return res; } if (sc->performPostCompilationInputOffset) { res = appendPushConstant(sc, uintType, "inputOffset"); if (res != VKFFT_SUCCESS) return res; } if (sc->performPostCompilationOutputOffset) { res = appendPushConstant(sc, uintType, "outputOffset"); if (res != VKFFT_SUCCESS) return res; } if (sc->performPostCompilationKernelOffset) { res = appendPushConstant(sc, uintType, "kernelOffset"); if (res != VKFFT_SUCCESS) return res; } #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) { VkFFTResult res = VKFFT_SUCCESS; char LFending[4] = ""; char uintType_32[30]; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); sprintf(uintType_32, "uint"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(uintType_32, "unsigned int"); 
#elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(uintType_32, "unsigned int"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(uintType_32, "unsigned int"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); sprintf(uintType_32, "uint"); #endif //res = appendConstant(sc, floatType, "loc_PI", "3.1415926535897932384626433832795", LFending); //if (res != VKFFT_SUCCESS) return res; //res = appendConstant(sc, floatType, "loc_SQRT1_2", "0.70710678118654752440084436210485", LFending); //if (res != VKFFT_SUCCESS) return res; if (sc->useRader) { for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { if (sc->raderContainer[i].prime > 0) { if (sc->inline_rader_g_pow == 1) { uint64_t g_pow = 1; #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__constant %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, sc->raderContainer[i].prime, sc->raderContainer[i].prime); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "constant %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, sc->raderContainer[i].prime, sc->raderContainer[i].prime); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, "const %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, sc->raderContainer[i].prime, sc->raderContainer[i].prime); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1; t++) { g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; sc->tempLen = sprintf(sc->tempStr, ", %" PRIu64 "", g_pow); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "};\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inline_rader_kernel) { 
#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__constant %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "constant %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, "const %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif if (sc->raderContainer[i].type == 0) { for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later if (!strcmp(floatType, "double")) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "float")) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (j < (sc->raderContainer[i].prime - 2)) { sc->tempLen = sprintf(sc->tempStr, ", "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "};\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * 
sc->raderContainer[i].generator) % sc->raderContainer[i].prime; } if (!strcmp(floatType, "double")) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", (double)cos(2.0 * g_pow * double_PI / sc->raderContainer[i].prime), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "float")) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", (float)cos(2.0 * g_pow * double_PI / sc->raderContainer[i].prime), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (j < (sc->raderContainer[i].prime - 2)) { sc->tempLen = sprintf(sc->tempStr, ", "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "};\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__constant %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "constant %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, "const %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif if (sc->raderContainer[i].type == 0) { for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later if (!strcmp(floatType, "double")) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j + 1] / 
(sc->raderContainer[i].prime - 1), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "float")) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (j < (sc->raderContainer[i].prime - 2)) { sc->tempLen = sprintf(sc->tempStr, ", "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "};\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; } if (!strcmp(floatType, "double")) { double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", (double)(-sin(2.0 * g_pow * double_PI / sc->raderContainer[i].prime)), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "float")) { float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", (float)(-sin(2.0 * g_pow * double_PI / sc->raderContainer[i].prime)), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (j < (sc->raderContainer[i].prime - 2)) { sc->tempLen = sprintf(sc->tempStr, ", "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "};\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } } } return res; } static inline VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout* sc, const char* floatType, 
const char* uintType) { VkFFTResult res = VKFFT_SUCCESS; char functionDefinitions[100] = ""; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(functionDefinitions, "__device__ static __inline__ "); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(functionDefinitions, "__device__ static __inline__ "); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(functionDefinitions, "static __inline__ "); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "half")) sprintf(vecType, "half2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif #if(VKFFT_BACKEND==0) res = appendConstant(sc, floatType, "loc_2_PI", "0.63661977236758134307553505349006", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "loc_PI_2", "1.5707963267948966192313216916398", LFending); if (res != 
VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a1", "0.99999999999999999999962122687403772", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a3", "-0.166666666666666666637194166219637268", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a5", "0.00833333333333333295212653322266277182", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a7", "-0.000198412698412696489459896530659927773", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a9", "2.75573192239364018847578909205399262e-6", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a11", "-2.50521083781017605729370231280411712e-8", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a13", "1.60590431721336942356660057796782021e-10", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a15", "-7.64712637907716970380859898835680587e-13", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "a17", "2.81018528153898622636194976499656274e-15", LFending); if (res != VKFFT_SUCCESS) return res; res = appendConstant(sc, floatType, "ab", "-7.97989713648499642889739108679114937e-18", LFending); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s%s sincos_20(double x)\n\ {\n\ //minimax coefs for sin for 0..pi/2 range\n\ double y = abs(x * loc_2_PI);\n\ double q = floor(y);\n\ int quadrant = int(q);\n\ double t = (quadrant & 1) != 0 ? 1 - y + q : y - q;\n\ t *= loc_PI_2;\n\ double t2 = t * t;\n\ double r = fma(fma(fma(fma(fma(fma(fma(fma(fma(ab, t2, a17), t2, a15), t2, a13), t2, a11), t2, a9), t2, a7), t2, a5), t2, a3), t2 * t, t);\n\ %s cos_sin;\n\ cos_sin.x = ((quadrant == 0) || (quadrant == 3)) ? sqrt(1 - r * r) : -sqrt(1 - r * r);\n\ r = x < 0 ? -r : r;\n\ cos_sin.y = (quadrant & 2) != 0 ? 
-r : r;\n\ return cos_sin;\n\ }\n\n", functionDefinitions, vecType, vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif ((VKFFT_BACKEND == 1) || (VKFFT_BACKEND == 2)) sc->tempLen = sprintf(sc->tempStr, "\ %s%s sincos_20(%s x)\n\ {\n\ %s cos_sin;\n\ sincos(x, &cos_sin.y, &cos_sin.x);\n\ return cos_sin;\n\ }\n\n", functionDefinitions, vecType, floatType, vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ %s%s normalize(%s v)\n\ {\n\ %s inv_norm = rsqrt(v.x*v.x + v.y*v.y);\n\ v.x = v.x * inv_norm;\n\ v.y = v.y * inv_norm;\n\ return v;\n\ }\n", functionDefinitions, vecType, vecType, floatType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ #elif((VKFFT_BACKEND == 3) || (VKFFT_BACKEND == 4)) sc->tempLen = sprintf(sc->tempStr, "\ %s%s sincos_20(%s x)\n\ {\n\ %s cos_sin;\n\ %s cos_val;\n\ cos_sin.y = sincos(x, &cos_val);\n\ cos_sin.x = cos_val;\n\ return cos_sin;\n\ }\n\n", functionDefinitions, vecType, floatType, vecType, floatType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeDifferent) { VkFFTResult res = VKFFT_SUCCESS; #if(VKFFT_BACKEND!=0) char functionDefinitions[100] = ""; char vecType[30]; char vecTypeDifferent[30]; #endif #if(VKFFT_BACKEND==0) #elif(VKFFT_BACKEND==1) sprintf(functionDefinitions, "__device__ static __inline__ "); #elif(VKFFT_BACKEND==2) sprintf(functionDefinitions, "__device__ static __inline__ "); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(functionDefinitions, "static __inline__ "); #endif #if(VKFFT_BACKEND!=0) if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeDifferent, "half")) sprintf(vecTypeDifferent, "f16vec2"); 
if (!strcmp(floatTypeDifferent, "float")) sprintf(vecTypeDifferent, "float2"); if (!strcmp(floatTypeDifferent, "double")) sprintf(vecTypeDifferent, "double2"); sc->tempLen = sprintf(sc->tempStr, "\ %s%s conv_%s(%s input)\n\ {\n\ %s ret_val;\n\ ret_val.x = (%s) input.x;\n\ ret_val.y = (%s) input.y;\n\ return ret_val;\n\ }\n\n", functionDefinitions, vecType, vecType, vecTypeDifferent, vecType, floatType, floatType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s%s conv_%s(%s input)\n\ {\n\ %s ret_val;\n\ ret_val.x = (%s) input.x;\n\ ret_val.y = (%s) input.y;\n\ return ret_val;\n\ }\n\n", functionDefinitions, vecTypeDifferent, vecTypeDifferent, vecType, vecTypeDifferent, floatTypeDifferent, floatTypeDifferent); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif return res; } static inline VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t inputType) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; switch (inputType) { case 0: case 1: case 2: case 3: case 4: case 6: { #if(VKFFT_BACKEND==0) if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2 * 2; sprintf(vecType, "f16vec2"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = 2 * sizeof(float); sprintf(vecType, "vec2"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = 2 * sizeof(double); sprintf(vecType, "dvec2"); } if (sc->inputBufferBlockNum == 1) { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ };\n\n", id, vecType, sc->inputBufferBlockSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ } inputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->inputBufferBlockSize, sc->inputBufferBlockNum); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #elif(VKFFT_BACKEND==1) if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2 * 2; sprintf(vecType, "f16vec2"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = 2 * sizeof(float); sprintf(vecType, "float2"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = 2 * sizeof(double); sprintf(vecType, "double2"); } #elif(VKFFT_BACKEND==2) if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2 * 2; sprintf(vecType, "f16vec2"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = 2 * sizeof(float); sprintf(vecType, "float2"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = 2 * sizeof(double); sprintf(vecType, "double2"); } #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2 * 2; sprintf(vecType, "f16vec2"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = 2 * sizeof(float); sprintf(vecType, "float2"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = 2 * sizeof(double); sprintf(vecType, "double2"); } #elif(VKFFT_BACKEND==5) if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2 * 2; sprintf(vecType, "half2"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = 2 * sizeof(float); sprintf(vecType, "float2"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = 2 * sizeof(double); sprintf(vecType, "double2"); } #endif break; } case 5: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: { if (!strcmp(floatTypeMemory, "half")) { sc->inputNumberByteSize = 2; sprintf(vecType, "float16_t"); } if (!strcmp(floatTypeMemory, "float")) { sc->inputNumberByteSize = sizeof(float); sprintf(vecType, "float"); } if (!strcmp(floatTypeMemory, "double")) { sc->inputNumberByteSize = sizeof(double); sprintf(vecType, 
"double"); } #if(VKFFT_BACKEND==0) if (sc->inputBufferBlockNum == 1) { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ };\n\n", id, vecType, 2 * sc->inputBufferBlockSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ %s inputs[%" PRIu64 "];\n\ } inputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->inputBufferBlockSize, sc->inputBufferBlockNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif break; } } return res; }

// Records the byte size of one output element in sc->outputNumberByteSize and,
// for the Vulkan backend (VKFFT_BACKEND==0), appends the GLSL declaration of
// the output storage buffer to the generated code.
//
// sc              - code generation state; receives outputNumberByteSize and the appended text
// id              - binding index written into the GLSL layout qualifier
// floatTypeMemory - "half", "float" or "double"; any other string leaves
//                   outputNumberByteSize untouched and the type name empty
// outputType      - 0..5 select a two-component element; 6, 110, 111, 120, 121,
//                   130, 131 and 140..145 select a scalar element; any other
//                   value does nothing
//
// Returns VKFFT_SUCCESS, or the error reported by VkAppendLine.
static inline VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t outputType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts empty: with an unrecognized floatTypeMemory none of the sprintf
	// calls below run, and "%s" must not read an indeterminate buffer.
	char vecType[30] = "";
	switch (outputType) {
	case 0: case 1: case 2: case 3: case 4: case 5:
	{
		// two scalars per element
#if(VKFFT_BACKEND==0)
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "vec2");
		}
		if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "dvec2");
		}
		if (sc->outputBufferBlockNum == 1) {
			// single buffer: anonymous block
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
};\n\n", id, vecType, sc->outputBufferBlockSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			// several buffers: array of blocks
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
} outputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->outputBufferBlockSize, sc->outputBufferBlockNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		// no declaration is emitted for these backends, only the element size is recorded
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2 * 2;
			sprintf(vecType, "f16vec2");
		}
		if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "float2");
		}
		if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "double2");
		}
#elif(VKFFT_BACKEND==5)
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2 * 2;
			sprintf(vecType, "half2");
		}
		if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = 2 * sizeof(float);
			sprintf(vecType, "float2");
		}
		if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = 2 * sizeof(double);
			sprintf(vecType, "double2");
		}
#endif
		break;
	}
	case 6: case 110: case 111: case 120: case 121: case 130: case 131:
	case 140: case 141: case 142: case 143: case 144: case 145:
	{
		// one scalar per element
		if (!strcmp(floatTypeMemory, "half")) {
			sc->outputNumberByteSize = 2;
			sprintf(vecType, "float16_t");
		}
		if (!strcmp(floatTypeMemory, "float")) {
			sc->outputNumberByteSize = sizeof(float);
			sprintf(vecType, "float");
		}
		if (!strcmp(floatTypeMemory, "double")) {
			sc->outputNumberByteSize = sizeof(double);
			sprintf(vecType, "double");
		}
#if(VKFFT_BACKEND==0)
		// the array is declared with twice outputBufferBlockSize scalar entries
		if (sc->outputBufferBlockNum == 1) {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
};\n\n", id, vecType, 2 * sc->outputBufferBlockSize);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
		else {
			sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\
	%s outputs[%" PRIu64 "];\n\
} outputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->outputBufferBlockSize, sc->outputBufferBlockNum);
			res = VkAppendLine(sc);
			if (res != VKFFT_SUCCESS) return res;
		}
#endif
		break;
	}
	}
	return res;
}

// Records the byte size of one convolution-kernel element in
// sc->kernelNumberByteSize and, for the Vulkan backend (VKFFT_BACKEND==0),
// appends the GLSL declaration of the kernel storage buffer.
//
// sc              - code generation state; receives kernelNumberByteSize and the appended text
// id              - binding index written into the GLSL layout qualifier
// floatTypeMemory - "half", "float" or "double"; any other string leaves
//                   kernelNumberByteSize untouched and the type name empty
//
// Returns VKFFT_SUCCESS, or the error reported by VkAppendLine.
static inline VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts empty so "%s" below never reads an indeterminate buffer.
	char vecType[30] = "";
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatTypeMemory, "half")) {
		sc->kernelNumberByteSize = 2 * 2;
		sprintf(vecType, "f16vec2");
	}
	if (!strcmp(floatTypeMemory, "float")) {
		sc->kernelNumberByteSize = 2 * sizeof(float);
		sprintf(vecType, "vec2");
	}
	if (!strcmp(floatTypeMemory, "double")) {
		sc->kernelNumberByteSize = 2 * sizeof(double);
		sprintf(vecType, "dvec2");
	}
	if (sc->kernelBlockNum == 1) {
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
	%s kernel_obj[%" PRIu64 "];\n\
};\n\n", id, vecType, sc->kernelBlockSize);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	else {
		sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\
	%s kernel_obj[%" PRIu64 "];\n\
} kernelBlocks[%" PRIu64 "];\n\n", id, vecType, sc->kernelBlockSize, sc->kernelBlockNum);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	// no declaration is emitted for these backends, only the element size is recorded
	if (!strcmp(floatTypeMemory, "half")) {
		sc->kernelNumberByteSize = 2 * 2;
		sprintf(vecType, "f16vec2");
	}
	if (!strcmp(floatTypeMemory, "float")) {
		sc->kernelNumberByteSize = 2 * sizeof(float);
		sprintf(vecType, "float2");
	}
	if (!strcmp(floatTypeMemory, "double")) {
		sc->kernelNumberByteSize = 2 * sizeof(double);
		sprintf(vecType, "double2");
	}
#elif(VKFFT_BACKEND==5)
	if (!strcmp(floatTypeMemory, "half")) {
		sc->kernelNumberByteSize = 2 * 2;
		sprintf(vecType, "half2");
	}
	if (!strcmp(floatTypeMemory, "float")) {
		sc->kernelNumberByteSize = 2 * sizeof(float);
		sprintf(vecType, "float2");
	}
	if (!strcmp(floatTypeMemory, "double")) {
		sc->kernelNumberByteSize = 2 * sizeof(double);
		sprintf(vecType, "double2");
	}
#endif
	return res;
}

// For the Vulkan backend (VKFFT_BACKEND==0), appends the GLSL declaration of
// the read-only twiddleLUT buffer at binding `id`. For the other backends the
// type name is selected but nothing is appended.
//
// sc        - code generation state that receives the appended text
// id        - binding index written into the GLSL layout qualifier
// floatType - "float" or "double"; any other string yields an empty type name
//
// Returns VKFFT_SUCCESS, or the error reported by VkAppendLine.
static inline VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) {
	VkFFTResult res = VKFFT_SUCCESS;
	// Starts empty so "%s" below never reads an indeterminate buffer.
	char vecType[30] = "";
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataLUT {\n\
	%s twiddleLUT[];\n\
};\n", id, vecType);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
#endif
	return res;
}

// For the Vulkan backend (VKFFT_BACKEND==0), appends the GLSL declaration of
// the read-only g_pow buffer of 32-bit unsigned integers at binding `id`.
// For the other backends the type name is selected but nothing is appended.
//
// Returns VKFFT_SUCCESS, or the error reported by VkAppendLine.
static inline VkFFTResult appendRaderUintLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id) {
	VkFFTResult res = VKFFT_SUCCESS;
	char uintType_32[30];
#if(VKFFT_BACKEND==0)
	sprintf(uintType_32, "uint");
	sc->tempLen = sprintf(sc->tempStr, "\
layout(std430, binding = %" PRIu64 ") readonly buffer DataRaderUintLUT {\n\
	%s g_pow[];\n\
};\n", id, uintType_32);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	sprintf(uintType_32, "unsigned int");
#elif(VKFFT_BACKEND==5)
	sprintf(uintType_32, "uint");
#endif
	return res;
}

// appendBluesteinLayoutVkFFT: vecType starts empty so an unrecognized floatType never prints an indeterminate buffer.
static inline VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30] = ""; #if(VKFFT_BACKEND==0) uint64_t loc_id = id; if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (sc->BluesteinConvolutionStep) { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinConvolutionKernel {\n\ %s BluesteinConvolutionKernel[];\n\ };\n", loc_id, vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; loc_id++; } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { sc->tempLen = sprintf(sc->tempStr, "\ layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinMultiplication {\n\ %s BluesteinMultiplication[];\n\ };\n", loc_id, vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; loc_id++; } #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) 
sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #endif return res; } static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t inputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) { VkFFTResult res = VKFFT_SUCCESS; switch (inputType % 1000) { case 0: case 2: case 3: case 4:case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {//single_c2c + single_c2c_strided char inputOffset[30] = ""; if (sc->inputOffset > 0) { sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize); } else { if (sc->performPostCompilationInputOffset) { if (inputType < 1000) sprintf(inputOffset, "consts.inputOffset + "); else sprintf(inputOffset, "consts.kernelOffset + "); } } char shiftX[500] = ""; if (sc->inputStride[0] == 1) sprintf(shiftX, "(%s)", index_x); else sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]); char shiftY[500] = ""; uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->size[1] > 1) { if (sc->numAxisUploads == 1) { if (sc->axisSwapped) { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]); } } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]); } } char shiftZ[500] = ""; if (sc->size[2] > 1) { if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); else sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); } else { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]); else sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]); } } char shiftCoordinate[500] = ""; uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); } 
if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { maxCoordinate = 1; sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]); } char shiftBatch[500] = ""; if ((sc->numBatches > 1) || (sc->numKernels > 1)) { if (sc->convolutionStep && (sc->numKernels > 1)) { sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]); } else sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]); } sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c char inputOffset[30] = ""; if (sc->inputOffset > 0) { sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize); } else { if (sc->performPostCompilationInputOffset) { if (inputType < 1000) sprintf(inputOffset, "consts.inputOffset + "); else sprintf(inputOffset, "consts.kernelOffset + "); } } char shiftX[500] = ""; if (sc->inputStride[0] == 1) sprintf(shiftX, "(%s)", index_x); else sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]); char shiftY[500] = ""; if (index_y) sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->inputStride[1]); char shiftZ[500] = ""; if (sc->size[2] > 1) { if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); else sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); } else { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", 
sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]); else sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]); } } char shiftCoordinate[500] = ""; uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); } if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { maxCoordinate = 1; sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]); } char shiftBatch[500] = ""; if ((sc->numBatches > 1) || (sc->numKernels > 1)) { if (sc->convolutionStep && (sc->numKernels > 1)) { sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]); } else sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]); } sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } } return res; } static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t outputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) { VkFFTResult res = VKFFT_SUCCESS; switch (outputType % 1000) {//single_c2c + single_c2c_strided case 0: case 2: case 3: case 4: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { char outputOffset[30] = ""; if (sc->outputOffset > 0) { sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize); } else { if (sc->performPostCompilationOutputOffset) { if (outputType < 1000) sprintf(outputOffset, "consts.outputOffset + "); else sprintf(outputOffset, "consts.kernelOffset + "); } } char shiftX[500] 
= ""; if (sc->numAxisUploads == 1) sprintf(shiftX, "(%s)", index_x); else sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]); char shiftY[500] = ""; uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->size[1] > 1) { if (sc->numAxisUploads == 1) { if (sc->axisSwapped) { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]); } } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]); else sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]); } } char shiftZ[500] = ""; if (sc->size[2] > 1) { if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); else sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); } else { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]); else sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]); } } char shiftCoordinate[500] = ""; uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if 
(sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); } if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { maxCoordinate = 1; sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]); } char shiftBatch[500] = ""; if ((sc->numBatches > 1) || (sc->numKernels > 1)) { if (sc->convolutionStep && (sc->numKernels > 1)) { sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]); } else sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]); } sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c char outputOffset[30] = ""; if (sc->outputOffset > 0) { sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize); } else { if (sc->performPostCompilationOutputOffset) { if (outputType < 1000) sprintf(outputOffset, "consts.outputOffset + "); else sprintf(outputOffset, "consts.kernelOffset + "); } } char shiftX[500] = ""; if (sc->numAxisUploads == 1) sprintf(shiftX, "(%s)", index_x); else sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]); char shiftY[500] = ""; if (index_y) sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->outputStride[1]); char shiftZ[500] = ""; if (sc->size[2] > 1) { if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); else sprintf(shiftZ, " + (%s %% 
%" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); } else { if (sc->performWorkGroupShift[2]) sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]); else sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]); } } char shiftCoordinate[500] = ""; uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; if (sc->numCoordinates * sc->matrixConvolution > 1) { sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); } if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { maxCoordinate = 1; sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]); } char shiftBatch[500] = ""; if ((sc->numBatches > 1) || (sc->numKernels > 1)) { if (sc->convolutionStep && (sc->numKernels > 1)) { sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]); } else sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]); } sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } } return res; } static inline VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t radix, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, char** regID) { VkFFTResult res = VKFFT_SUCCESS; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if 
(!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char* temp = sc->temp; //sprintf(temp, "loc_0"); char* w = sc->w; //sprintf(w, "w"); char* iw = sc->iw; //sprintf(iw, "iw"); char convolutionInverse[30] = ""; if (sc->convolutionStep) sprintf(convolutionInverse, ", %s inverse", uintType); switch (radix) { case 2: { /*if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s LUTId) {\n", vecType, vecType, uintType); } else { sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s angle) {\n", vecType, vecType, floatType); }*/ /*VkAppendLine(sc, " {\n"); sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " {\n\ %s 
temp;\n", vecType);*/ if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = VkMulComplex(sc, temp, regID[1], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[1], regID[0], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], regID[0], temp); if (res != VKFFT_SUCCESS) return res; /*VkAppendLine(sc, " }\n"); sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\ }\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ break; } case 3: { /* if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s LUTId) {\n", vecType, vecType, vecType, uintType); 
} else { sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s angle) {\n", vecType, vecType, vecType, floatType); }*/ char* tf[2]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 2; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); /*for (uint64_t i = 0; i < 3; i++) { sc->locID[i] = (char*)malloc(sizeof(char) * 50); sprintf(sc->locID[i], "loc_%" PRIu64 "", i); sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 4.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 4.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 4.0 / 3.0, 4.0 / 
3.0); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 4.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = VkMulComplex(sc, sc->locID[2], regID[2], w, 0); /*sc->tempLen = sprintf(sc->tempStr, "\ loc_2.x = temp%s.x * w.x - temp%s.y * w.y;\n\ loc_2.y = temp%s.y * w.x + temp%s.x * w.y;\n", regID[2], regID[2], regID[2], regID[2]);*/ if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 / 3.0, 2.0 / 3.0); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s=sincos_20(angle*%.17e%s);\n", w, 2.0 / 3.0, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = VkMulComplex(sc, sc->locID[1], regID[1], w, 0); if (res != VKFFT_SUCCESS) 
return res; res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[2], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = loc_1 + loc_2;\n\ temp%s = loc_1 - loc_2;\n", regID[1], regID[2]);*/ res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[2], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[0], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_0 = temp%s + temp%s;\n\ loc_1 = temp%s - 0.5 * temp%s;\n\ loc_2 = -0.8660254037844386467637231707529 * temp%s;\n\ temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[1], regID[2], regID[0]);*/ if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[2], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s.x = loc_1.x - loc_2.y; \n\ temp%s.y = loc_1.y + loc_2.x; \n\ temp%s.x = loc_1.x + loc_2.y; \n\ temp%s.y = loc_1.y - loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/ } else { res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[2], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s.x = loc_1.x + loc_2.y; \n\ temp%s.y = loc_1.y - loc_2.x; \n\ temp%s.x = loc_1.x - loc_2.y; \n\ temp%s.y = loc_1.y + loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/ } //VkAppendLine(sc, " }\n"); for (uint64_t i = 0; i < 2; i++) { free(tf[i]); tf[i] = 0; //free(sc->locID[i]); } 
//free(sc->locID[2]); break; } case 4: { /*if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, uintType, convolutionInverse); else sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s angle%s) {\n", vecType, vecType, vecType, vecType, floatType, convolutionInverse); */ //VkAppendLine(sc, " {\n"); //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); //res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = VkMulComplex(sc, temp, regID[2], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[2], regID[0], temp); if (res != 
VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], regID[0], temp); if (res != VKFFT_SUCCESS) return res; res = VkMulComplex(sc, temp, regID[3], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], regID[1], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[1], regID[1], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n\ temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n\ //DIF 2nd stage with angle\n", regID[2], regID[2], regID[2], regID[2], regID[2], regID[0], regID[0], regID[0], regID[3], regID[3], regID[3], regID[3], regID[3], regID[1], regID[1], regID[1]);*/ if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s=twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.5%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } res = VkMulComplex(sc, temp, regID[1], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[1], regID[0], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], regID[0], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(-w.y, w.x);\n\n", vecType); } res = VkMulComplex(sc, temp, regID[3], w, 0); if (res 
!= VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], regID[2], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[2], regID[2], temp); if (res != VKFFT_SUCCESS) return res; //res = VkMovComplex(sc, temp, regID[1]); //if (res != VKFFT_SUCCESS) return res; uint64_t permute2[4] = { 0,2,1,3 }; res = VkPermute(sc, permute2, 4, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; /*res = VkMovComplex(sc, regID[1], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[2], temp); if (res != VKFFT_SUCCESS) return res;*/ /*VkAppendLine(sc, " }\n"); sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n\ temp = temp%s;\n\ temp%s = temp%s;\n\ temp%s = temp;\n\ }\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2], regID[1], regID[1], regID[2], regID[2]);*/ break; } case 5: { /*if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType); } else { sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); }*/ char* tf[5]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 5; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending); sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending); sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending); sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); /*for (uint64_t i = 0; i < 
5; i++) { sc->locID[i] = (char*)malloc(sizeof(char) * 50); sprintf(sc->locID[i], "loc_%" PRIu64 "", i); sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ /*sc->tempLen = sprintf(sc->tempStr, " {\n\ %s loc_0;\n %s loc_1;\n %s loc_2;\n %s loc_3;\n %s loc_4;\n", vecType, vecType, vecType, vecType, vecType);*/ for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen 
= sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\ loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/ } res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[4], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[3], regID[1], regID[2]); if (res != 
VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], regID[3], regID[4]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = loc_1 + loc_4;\n\ temp%s = loc_2 + loc_3;\n\ temp%s = loc_2 - loc_3;\n\ temp%s = loc_1 - loc_4;\n\ loc_3 = temp%s - temp%s;\n\ loc_4 = temp%s + temp%s;\n", regID[1], regID[2], regID[3], regID[4], regID[1], regID[2], regID[3], regID[4]);*/ res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[2], regID[2], tf[0], regID[0], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[3], regID[3], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[4], regID[4], tf[2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_0 = temp%s + temp%s + temp%s;\n\ loc_1 = temp%s - 0.5 * temp%s;\n\ loc_2 = temp%s - 0.5 * temp%s;\n\ temp%s *= 1.538841768587626701285145288018455;\n\ temp%s *= -0.363271264002680442947733378740309;\n\ loc_3 *= -0.809016994374947424102293417182819;\n\ loc_4 *= -0.587785252292473129168705954639073;\n", regID[0], regID[1], regID[2], regID[0], regID[1], regID[0], regID[2], regID[3], regID[4]);*/ res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[3], regID[3], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, 
sc->locID[4], sc->locID[4], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[0], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_1 -= loc_3;\n\ loc_2 += loc_3;\n\ loc_3 = temp%s+loc_4;\n\ loc_4 += temp%s;\n\ temp%s = loc_0;\n", regID[3], regID[4], regID[0]);*/ if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[2], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[3], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[4], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s.x = loc_1.x - loc_4.y; \n\ temp%s.y = loc_1.y + loc_4.x; \n\ temp%s.x = loc_2.x - loc_3.y; \n\ temp%s.y = loc_2.y + loc_3.x; \n\ temp%s.x = loc_2.x + loc_3.y; \n\ temp%s.y = loc_2.y - loc_3.x; \n\ temp%s.x = loc_1.x + loc_4.y; \n\ temp%s.y = loc_1.y - loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4]);*/ } else { res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[2], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[4], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s.x = loc_1.x + loc_4.y; \n\ temp%s.y = loc_1.y - loc_4.x; \n\ temp%s.x = loc_2.x + loc_3.y; \n\ temp%s.y = loc_2.y - loc_3.x; \n\ temp%s.x = loc_2.x - loc_3.y; \n\ temp%s.y = loc_2.y + loc_3.x; \n\ temp%s.x = loc_1.x - loc_4.y; \n\ temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], 
regID[3], regID[4], regID[4]);*/ } //VkAppendLine(sc, " }\n"); for (uint64_t i = 0; i < 5; i++) { free(tf[i]); tf[i] = 0; //free(sc->locID[i]); } break; } case 6: { char* tf[2]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 2; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; //uint64_t P = 3; uint64_t Q = 2; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i 
+ 2 * Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } } res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[1], regID[0], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], regID[0], temp); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = -0.5%s;\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.8660254037844386467637231707529%s;\n\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -0.5%s;\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -0.8660254037844386467637231707529%s;\n\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, temp, regID[3], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], regID[2], temp); if (res != 
VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[2], regID[2], temp); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMulComplex(sc, temp, regID[5], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[5], regID[4], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[4], regID[4], temp); if (res != VKFFT_SUCCESS) return res; uint64_t permute2[6] = { 0,3,4,1,2,5 }; res = VkPermute(sc, permute2, 6, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; /*res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[1], regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[3], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[2], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[4], temp); if (res != VKFFT_SUCCESS) return res;*/ for (uint64_t i = 0; i < 2; i++) { free(tf[i]); tf[i] = 0; } break; } case 7: { /*if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType); } else { sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); }*/ char* tf[8]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 8; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending); sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending); 
sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending); sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending); if (stageAngle < 0) { sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending); sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending); sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending); sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending); } else { sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending); sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending); sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending); sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending); } /*for (uint64_t i = 0; i < 7; i++) { sc->locID[i] = (char*)malloc(sizeof(char) * 50); sprintf(sc->locID[i], "loc_%" PRIu64 "", i); sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen 
= sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\ loc_%" PRIu64 
".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/ } res = VkMovComplex(sc, sc->locID[0], regID[0]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], sc->locID[1], sc->locID[6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[1], sc->locID[1], sc->locID[6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[4], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[5], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_0 = temp%s;\n\ temp%s = loc_1 + loc_6;\n\ temp%s = loc_1 - loc_6;\n\ temp%s = loc_2 + loc_5;\n\ temp%s = loc_2 - loc_5;\n\ temp%s = loc_4 + loc_3;\n\ temp%s = loc_4 - loc_3;\n", regID[0], regID[0], regID[1], regID[2], regID[3], regID[4], regID[5]);*/ res = VkAddComplex(sc, sc->locID[5], regID[1], regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], regID[0], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_5 = temp%s + temp%s + temp%s;\n\ loc_1 = temp%s + temp%s + temp%s;\n\ loc_0 += loc_1;\n", regID[1], regID[3], regID[5], regID[0], regID[2], regID[4]);*/ res = VkSubComplex(sc, sc->locID[2], regID[0], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[3], regID[4], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[4], 
regID[2], regID[0]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_2 = temp%s - temp%s;\n\ loc_3 = temp%s - temp%s;\n\ loc_4 = temp%s - temp%s;\n", regID[0], regID[4], regID[4], regID[2], regID[2], regID[0]);*/ res = VkSubComplex(sc, regID[0], regID[1], regID[5]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[2], regID[5], regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[4], regID[3], regID[1]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = temp%s - temp%s;\n\ temp%s = temp%s - temp%s;\n\ temp%s = temp%s - temp%s;\n", regID[0], regID[1], regID[5], regID[2], regID[5], regID[3], regID[4], regID[3], regID[1]);*/ res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[0], regID[0], tf[5]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[2], regID[2], tf[6]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[4], regID[4], tf[7]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_1 *= -1.16666666666666651863693004997913;\n\ loc_2 *= 0.79015646852540022404554065360571;\n\ loc_3 *= 0.05585426728964774240049351305970;\n\ loc_4 *= 0.73430220123575240531721419756650;\n\ loc_5 *= 0.44095855184409837868031445395900;\n\ temp%s *= 0.34087293062393136944265847887436;\n\ temp%s *= -0.53396936033772524066165487965918;\n\ temp%s *= 0.87484229096165666561546458979137;\n", regID[0], regID[2], 
regID[4]);*/ res = VkSubComplex(sc, regID[5], regID[4], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplexInv(sc, regID[6], regID[4], regID[0]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[4], regID[0], regID[2]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = temp%s - temp%s;\n\ temp%s = - temp%s - temp%s;\n\ temp%s = temp%s + temp%s;\n", regID[5], regID[4], regID[2], regID[6], regID[4], regID[0], regID[4], regID[0], regID[2]);*/ res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[1], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[2], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplexInv(sc, regID[3], sc->locID[2], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = loc_0 + loc_1;\n\ temp%s = loc_2 + loc_3;\n\ temp%s = loc_4 - loc_3;\n\ temp%s = - loc_2 - loc_4;\n", regID[0], regID[1], regID[2], regID[3]);*/ res = VkAddComplex(sc, sc->locID[1], regID[0], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[2], regID[0], regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[3], regID[0], regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], regID[4], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[6], regID[6], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[0], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ loc_1 = temp%s + temp%s;\n\ loc_2 = temp%s + temp%s;\n\ loc_3 = temp%s + temp%s;\n\ loc_4 = temp%s + loc_5;\n\ loc_6 = temp%s + loc_5;\n\ loc_5 += temp%s;\n\ temp%s = 
loc_0;\n", regID[0], regID[1], regID[0], regID[2], regID[0], regID[3], regID[4], regID[6], regID[5], regID[0]);*/ res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[2], sc->locID[3], sc->locID[6], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[5], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[4], sc->locID[2], sc->locID[5], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[5], sc->locID[3], sc->locID[6], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[6], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s.x = loc_1.x + loc_4.y; \n\ temp%s.y = loc_1.y - loc_4.x; \n\ temp%s.x = loc_3.x + loc_6.y; \n\ temp%s.y = loc_3.y - loc_6.x; \n\ temp%s.x = loc_2.x - loc_5.y; \n\ temp%s.y = loc_2.y + loc_5.x; \n\ temp%s.x = loc_2.x + loc_5.y; \n\ temp%s.y = loc_2.y - loc_5.x; \n\ temp%s.x = loc_3.x - loc_6.y; \n\ temp%s.y = loc_3.y + loc_6.x; \n\ temp%s.x = loc_1.x - loc_4.y; \n\ temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4], regID[5], regID[5], regID[6], regID[6]); VkAppendLine(sc, " }\n");*/ /*for (uint64_t i = 0; i < 7; i++) { free(sc->locID[i]); }*/ for (uint64_t i = 0; i < 8; i++) { free(tf[i]); tf[i] = 0; } break; } case 8: { /*if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, inout %s temp_7, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, uintType, convolutionInverse); else sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, 
inout %s temp_7, %s angle%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, floatType, convolutionInverse); */ //VkAppendLine(sc, " {\n"); /*sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } for (uint64_t i = 0; i < 4; i++) { res = VkMulComplex(sc, temp, regID[i + 4], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); 
if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 0], regID[i + 0], regID[i + 0]);*/ } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.5%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res;*/ } } } for (uint64_t i = 0; i < 2; i++) { res = VkMulComplex(sc, temp, regID[i + 2], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/ } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 4; i < 6; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/ } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.25%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } res = VkMulComplex(sc, temp, regID[1], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[1], regID[0], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[0], regID[0], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * 
w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } res = VkMulComplex(sc, temp, regID[3], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[3], regID[2], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[2], regID[2], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2]);*/ if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, 
0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, temp, regID[5], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[5], regID[4], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[4], regID[4], temp); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n", regID[5], regID[5], regID[5], regID[5], regID[5], regID[4], regID[4], regID[4]);*/ if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } res = VkMulComplex(sc, temp, regID[7], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[7], regID[6], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[6], regID[6], temp); if (res != VKFFT_SUCCESS) return res; uint64_t permute2[8] = { 0,4,2,6,1,5,3,7 }; res = VkPermute(sc, permute2, 8, 1, regID, temp); if (res != 
VKFFT_SUCCESS) return res; /* if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[1], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[4], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[3], regID[6]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[6], temp); if (res != VKFFT_SUCCESS) return res;*/ /*sc->tempLen = sprintf(sc->tempStr, "\ temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ temp%s = temp%s - temp;\n\ temp%s = temp%s + temp;\n\n\ temp = temp%s;\n\ temp%s = temp%s;\n\ temp%s = temp;\n\n\ temp = temp%s;\n\ temp%s = temp%s;\n\ temp%s = temp;\n\ }\n\n", regID[7], regID[7], regID[7], regID[7], regID[7], regID[6], regID[6], regID[6], regID[1], regID[1], regID[4], regID[4], regID[3], regID[3], regID[6], regID[6]); //VkAppendLine(sc, " }\n");*/ break; } case 9: { char* tf[2]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 2; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res 
= VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 
2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; //res = VkMovComplex(sc, regID[2], sc->locID[2]); //if (res != VKFFT_SUCCESS) return res; uint64_t P = 3; uint64_t Q = 3; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < P; i++) { 
if (i > 0) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp); if (res != VKFFT_SUCCESS) return res; } else { res = VkMovComplex(sc, sc->locID[1], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[2]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, 
regID[Q * i + 2], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf[0], regID[Q * i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[Q * i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } } uint64_t permute2[9] = { 0,3,6,1,4,7,2,5,8 }; res = VkPermute(sc, permute2, 9, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; /*res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[1], regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[3], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[2], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[4], temp); if (res != VKFFT_SUCCESS) return res;*/ for (uint64_t i = 0; i < 2; i++) { free(tf[i]); tf[i] = 0; } break; } case 10: { char* tf[5]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 5; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], 
"1.538841768587626701285145288018455%s", LFending); sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending); sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending); sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; uint64_t P = 5; uint64_t Q = 2; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]); if (res != 
VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = 
VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0); } else { res = VkMovComplex(sc, temp, regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; } res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[10] = { 0, 2, 4, 6, 8, 1, 3, 5, 7, 
9 }; res = VkPermute(sc, permute2, 10, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 5; i++) { free(tf[i]); tf[i] = 0; } break; } case 11: { char* tf_x[20]; char* tf_y[20]; for (uint64_t i = 0; i < 10; i++) { tf_x[i] = (char*)malloc(sizeof(char) * 50); if (!tf_x[i]) { for (uint64_t j = 0; j < i; j++) { free(tf_x[j]); tf_x[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } tf_y[i] = (char*)malloc(sizeof(char) * 50); if (!tf_y[i]) { for (uint64_t j = 0; j < 10; j++) { free(tf_x[j]); tf_x[j] = 0; } for (uint64_t j = 0; j < i; j++) { free(tf_y[j]); tf_y[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf_x[0], "8.4125353283118116886306336876800e-01%s", LFending); sprintf(tf_x[1], "-9.5949297361449738990105129410324e-01%s", LFending); sprintf(tf_x[2], "-1.4231483827328514046015907335008e-01%s", LFending); sprintf(tf_x[3], "-6.5486073394528506407246543075118e-01%s", LFending); sprintf(tf_x[4], "4.1541501300188642567903264668505e-01%s", LFending); sprintf(tf_x[5], "8.4125353283118116886306336876800e-01%s", LFending); sprintf(tf_x[6], "-9.5949297361449738990105129410324e-01%s", LFending); sprintf(tf_x[7], "-1.4231483827328514046015907335008e-01%s", LFending); sprintf(tf_x[8], "-6.5486073394528506407246543075118e-01%s", LFending); sprintf(tf_x[9], "4.1541501300188642567903264668505e-01%s", LFending); if (stageAngle < 0) { sprintf(tf_y[0], "-5.4064081745559758210122047739077e-01%s", LFending); sprintf(tf_y[1], "2.8173255684142969773359373164556e-01%s", LFending); sprintf(tf_y[2], "-9.8982144188093273235937163967435e-01%s", LFending); sprintf(tf_y[3], "7.5574957435425828375808593451168e-01%s", LFending); sprintf(tf_y[4], "9.0963199535451837136413102968824e-01%s", LFending); sprintf(tf_y[5], "5.4064081745559758210122047739077e-01%s", LFending); sprintf(tf_y[6], "-2.8173255684142969773359373164556e-01%s", LFending); sprintf(tf_y[7], "9.8982144188093273235937163967435e-01%s", LFending); sprintf(tf_y[8], 
"-7.5574957435425828375808593451168e-01%s", LFending); sprintf(tf_y[9], "-9.0963199535451837136413102968824e-01%s", LFending); } else { sprintf(tf_y[0], "5.4064081745559758210122047739077e-01%s", LFending); sprintf(tf_y[1], "-2.8173255684142969773359373164556e-01%s", LFending); sprintf(tf_y[2], "9.8982144188093273235937163967435e-01%s", LFending); sprintf(tf_y[3], "-7.5574957435425828375808593451168e-01%s", LFending); sprintf(tf_y[4], "-9.0963199535451837136413102968824e-01%s", LFending); sprintf(tf_y[5], "-5.4064081745559758210122047739077e-01%s", LFending); sprintf(tf_y[6], "2.8173255684142969773359373164556e-01%s", LFending); sprintf(tf_y[7], "-9.8982144188093273235937163967435e-01%s", LFending); sprintf(tf_y[8], "7.5574957435425828375808593451168e-01%s", LFending); sprintf(tf_y[9], "9.0963199535451837136413102968824e-01%s", LFending); } for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[0], regID[0]); if (res != VKFFT_SUCCESS) return res; uint64_t permute[11] = { 0,1,2,4,8,5,10,9,7,3,6 }; res = VkPermute(sc, permute, 11, 0, 0, w); if (res != VKFFT_SUCCESS) 
return res; for (uint64_t i = 0; i < 5; i++) { res = VkSubComplex_x(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_x(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex_y(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 5; i++) { res = VkAddComplex_x(sc, regID[0], regID[0], regID[i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[0], regID[0], regID[i + 6]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 1; i < 6; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s=%s;\n", sc->locID[i], sc->locID[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 6; i < 11; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x=0;\n\ %s.y=0;\n", sc->locID[i], sc->locID[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 5; i++) { for (uint64_t j = 0; j < 5; j++) { uint64_t id = ((10 - i) + j) % 10; res = VkFMA3Complex_const_w(sc, sc->locID[j + 1], sc->locID[j + 6], regID[i + 1], tf_x[id], tf_y[id], regID[i + 6], w); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 1; i < 6; i++) { res = VkSubComplex_x(sc, regID[i], sc->locID[i], sc->locID[i + 5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[i], sc->locID[i], sc->locID[i + 5]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 1; i < 6; i++) { res = VkAddComplex_x(sc, regID[i + 5], sc->locID[i], sc->locID[i + 5]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex_y(sc, regID[i + 5], sc->locID[i], sc->locID[i + 5]); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[11] = { 0,1,10,3,9,7,2,4,8,5,6 }; res = VkPermute(sc, permute2, 11, 1, regID, w); if (res != 
VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 10; i++) { free(tf_x[i]); tf_x[i] = 0; free(tf_y[i]); tf_y[i] = 0; } //old version /*char* tf[50]; for (uint64_t i = 0; i < 20; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-1.1000000000000000000000000000000e+00%s", LFending); sprintf(tf[2], "2.5309761160595911633208743296564e-01%s", LFending); sprintf(tf[3], "-1.2882006107736785338602203410119e+00%s", LFending); sprintf(tf[4], "3.0463223966921237906291253239033e-01%s", LFending); sprintf(tf[5], "-3.9133961551191742689326247273129e-01%s", LFending); sprintf(tf[6], "-2.8710222533928502208766531111905e+00%s", LFending); sprintf(tf[7], "1.3749079866163838037351752063842e+00%s", LFending); sprintf(tf[8], "8.1717813534121219731787277851254e-01%s", LFending); sprintf(tf[9], "1.8007465064456784631374830496497e+00%s", LFending); sprintf(tf[10], "-8.5949297361449739085514920589048e-01%s", LFending); if (stageAngle < 0) { sprintf(tf[1], "3.3166247903553996989600705092016e-01%s", LFending); sprintf(tf[11], "-2.3734704547482796677115857164608e+00%s", LFending); sprintf(tf[12], "-2.4836393087493469078452790199663e-02%s", LFending); sprintf(tf[13], "4.7401701751282859786940093727026e-01%s", LFending); sprintf(tf[14], "7.4218392777061303888785914750770e-01%s", LFending); sprintf(tf[15], "1.4064733090946088811534764317912e+00%s", LFending); sprintf(tf[16], "-1.1913645521959481676788072945783e+00%s", LFending); sprintf(tf[17], "7.0808888503950306869683117838576e-01%s", LFending); sprintf(tf[18], "2.5890826061416793990588303131517e-01%s", LFending); sprintf(tf[19], "-4.9929922194110284983104008915689e-02%s", LFending); } else { sprintf(tf[1], "-3.3166247903553996989600705092016e-01%s", LFending); sprintf(tf[11], "2.3734704547482796677115857164608e+00%s", LFending); sprintf(tf[12], "2.4836393087493469078452790199663e-02%s", LFending); sprintf(tf[13], 
"-4.7401701751282859786940093727026e-01%s", LFending); sprintf(tf[14], "-7.4218392777061303888785914750770e-01%s", LFending); sprintf(tf[15], "-1.4064733090946088811534764317912e+00%s", LFending); sprintf(tf[16], "1.1913645521959481676788072945783e+00%s", LFending); sprintf(tf[17], "-7.0808888503950306869683117838576e-01%s", LFending); sprintf(tf[18], "-2.5890826061416793990588303131517e-01%s", LFending); sprintf(tf[19], "4.9929922194110284983104008915689e-02%s", LFending); } for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[0], regID[0]); if (res != VKFFT_SUCCESS) return res; uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 }; res = VkPermute(sc, permute, 11, 0, 0, w); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 5; i++) { res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[1], regID[1]); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 
4; i++) { res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i + 3], regID[i + 1], regID[5]); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[2], regID[6]); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 4; i++) { res = VkAddComplex(sc, sc->locID[2], sc->locID[2], regID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i + 7], regID[i + 6], regID[10]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[2], sc->locID[2], tf[1], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < 2; k++) { res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], sc->locID[k * 4 + 5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], sc->locID[k * 4 + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 3], sc->locID[k * 4 + 4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 5], sc->locID[k * 4 + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], regID[k * 4 + 3], regID[k * 4 + 4]); if (res != VKFFT_SUCCESS) return res; if (k == 0) { res = VkMulComplexNumber(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5]); if (res != 
VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10]); if (res != VKFFT_SUCCESS) return res; } else { res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 3]); if (res != 
VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 5], regID[k * 4 + 5], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 6], regID[k * 4 + 6], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 6]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[1], regID[0], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[5], regID[1]); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 4; i++) { res = VkAddComplex(sc, sc->locID[i + 1], regID[1], regID[i + 3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[5], sc->locID[5], regID[i + 3]); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[10], regID[2]); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 4; i++) { res = VkAddComplex(sc, sc->locID[i + 6], regID[2], regID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[10], sc->locID[10], regID[i + 7]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 5; i++) { res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) 
return res; res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 }; res = VkPermute(sc, permute2, 11, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 20; i++) { free(tf[i]); tf[i] = 0; } */ break; } case 12: { char* tf[2]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 2; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / 
radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; //res = VkMovComplex(sc, regID[2], sc->locID[2]); //if (res != VKFFT_SUCCESS) return res; uint64_t P = 3; uint64_t Q = 4; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + 
Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < P; i++) { for (uint64_t j = 0; j < Q; j++) { if (i > 0) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * j * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * j * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * j * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(2 * i * j * double_PI / 
radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, regID[Q * i + j], regID[Q * i + j], w, temp); if (res != VKFFT_SUCCESS) return res; } } res = VkMovComplex(sc, temp, regID[Q * i + 2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[Q * i + 2], regID[Q * i], regID[Q * i + 2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[Q * i + 3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 1], regID[Q * i + 3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i + 1], regID[Q * i + 1], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, regID[Q * i + 3]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, regID[Q * i + 3]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, regID[Q * i + 3]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, regID[Q * i + 3]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 2], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i + 2], regID[Q * i + 2], temp); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[12] = { 0,4,8,2,6,10,1,5,9,3,7,11 }; res = VkPermute(sc, permute2, 12, 1, regID, 
temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 2; i++) { free(tf[i]); tf[i] = 0; } break; } case 13: { char* tf_x[20]; char* tf_y[20]; for (uint64_t i = 0; i < 12; i++) { tf_x[i] = (char*)malloc(sizeof(char) * 50); if (!tf_x[i]) { for (uint64_t j = 0; j < i; j++) { free(tf_x[j]); tf_x[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } tf_y[i] = (char*)malloc(sizeof(char) * 50); if (!tf_y[i]) { for (uint64_t j = 0; j < 12; j++) { free(tf_x[j]); tf_x[j] = 0; } for (uint64_t j = 0; j < i; j++) { free(tf_y[j]); tf_y[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf_x[0], "8.8545602565320989587194927539215e-01%s", LFending); sprintf(tf_x[1], "-9.7094181742605202719252621701429e-01%s", LFending); sprintf(tf_x[2], "1.2053668025532305345994812592614e-01%s", LFending); sprintf(tf_x[3], "-7.4851074817110109868448578063216e-01%s", LFending); sprintf(tf_x[4], "-3.5460488704253562600274447824678e-01%s", LFending); sprintf(tf_x[5], "5.6806474673115580237845248512407e-01%s", LFending); sprintf(tf_x[6], "8.8545602565320989608878970988926e-01%s", LFending); sprintf(tf_x[7], "-9.7094181742605202719252621701429e-01%s", LFending); sprintf(tf_x[8], "1.2053668025532305324988395500707e-01%s", LFending); sprintf(tf_x[9], "-7.4851074817110109863027567200788e-01%s", LFending); sprintf(tf_x[10], "-3.5460488704253562600274447824678e-01%s", LFending); sprintf(tf_x[11], "5.6806474673115580248687270237262e-01%s", LFending); if (stageAngle < 0) { sprintf(tf_y[0], "-4.6472317204376854566250792943904e-01%s", LFending); sprintf(tf_y[1], "2.3931566428755776706062234626682e-01%s", LFending); sprintf(tf_y[2], "9.9270887409805399278096144088934e-01%s", LFending); sprintf(tf_y[3], "-6.6312265824079520232193704631918e-01%s", LFending); sprintf(tf_y[4], "9.3501624268541482344965776185575e-01%s", LFending); sprintf(tf_y[5], "8.2298386589365639468820687318917e-01%s", LFending); sprintf(tf_y[6], "4.6472317204376854531014222338126e-01%s", LFending); sprintf(tf_y[7], 
"-2.3931566428755776695220212901827e-01%s", LFending); sprintf(tf_y[8], "-9.9270887409805399283517154951362e-01%s", LFending); sprintf(tf_y[9], "6.6312265824079520243035726356773e-01%s", LFending); sprintf(tf_y[10], "-9.3501624268541482344965776185575e-01%s", LFending); sprintf(tf_y[11], "-8.2298386589365639457978665594062e-01%s", LFending); } else { sprintf(tf_y[0], "4.6472317204376854566250792943904e-01%s", LFending); sprintf(tf_y[1], "-2.3931566428755776706062234626682e-01%s", LFending); sprintf(tf_y[2], "-9.9270887409805399278096144088934e-01%s", LFending); sprintf(tf_y[3], "6.6312265824079520232193704631918e-01%s", LFending); sprintf(tf_y[4], "-9.3501624268541482344965776185575e-01%s", LFending); sprintf(tf_y[5], "-8.2298386589365639468820687318917e-01%s", LFending); sprintf(tf_y[6], "-4.6472317204376854531014222338126e-01%s", LFending); sprintf(tf_y[7], "2.3931566428755776695220212901827e-01%s", LFending); sprintf(tf_y[8], "9.9270887409805399283517154951362e-01%s", LFending); sprintf(tf_y[9], "-6.6312265824079520243035726356773e-01%s", LFending); sprintf(tf_y[10], "9.3501624268541482344965776185575e-01%s", LFending); sprintf(tf_y[11], "8.2298386589365639457978665594062e-01%s", LFending); } for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[0], regID[0]); if (res != VKFFT_SUCCESS) return res; uint64_t permute[13] = { 0, 1, 2, 4, 8, 3, 6, 12, 11, 9, 5, 10, 7 }; res = VkPermute(sc, permute, 13, 0, 0, w); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 6; i++) { res = VkSubComplex_x(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_x(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex_y(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 6; i++) { res = VkAddComplex_x(sc, regID[0], regID[0], regID[i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[0], regID[0], regID[i + 7]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 1; i < 7; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s=%s;\n", sc->locID[i], sc->locID[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 7; i < 13; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x=0;\n\ %s.y=0;\n", sc->locID[i], sc->locID[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 6; i++) { for (uint64_t j = 0; j < 6; j++) { uint64_t id = ((12 - i) + j) % 12; res = VkFMA3Complex_const_w(sc, sc->locID[j + 1], sc->locID[j + 7], regID[i + 1], tf_x[id], tf_y[id], regID[i + 7], w); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 1; i < 7; i++) { res = VkSubComplex_x(sc, regID[i], sc->locID[i], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex_y(sc, regID[i], sc->locID[i], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return 
res; } for (uint64_t i = 1; i < 7; i++) { res = VkAddComplex_x(sc, regID[i + 6], sc->locID[i], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex_y(sc, regID[i + 6], sc->locID[i], sc->locID[i + 6]); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[13] = { 0,1,12,9,11,4,8,2,10,5,3,6,7 }; res = VkPermute(sc, permute2, 13, 1, regID, w); //if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 12; i++) { free(tf_x[i]); tf_x[i] = 0; free(tf_y[i]); tf_y[i] = 0; } for (uint64_t i = 0; i < 12; i++) { free(tf_x[i]); tf_x[i] = 0; free(tf_y[i]); tf_y[i] = 0; } //old version /*char* tf[20]; //char* tf2[4]; //char* tf2inv[4]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 20; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } //tf2[i] = (char*)malloc(sizeof(char) * 50); //tf2inv[i] = (char*)malloc(sizeof(char) * 50); } sprintf(tf[0], "-1.08333333333333333e+00%s", LFending); sprintf(tf[1], "-3.00462606288665890e-01%s", LFending); sprintf(tf[5], "1.00707406572753300e+00%s", LFending); sprintf(tf[6], "7.31245990975348148e-01%s", LFending); sprintf(tf[7], "-5.79440018900960419e-01%s", LFending); sprintf(tf[8], "5.31932498429674383e-01%s", LFending); sprintf(tf[9], "-5.08814921720397551e-01%s", LFending); sprintf(tf[10], "-7.70585890309231480e-03%s", LFending); if (stageAngle < 0) { sprintf(tf[2], "-7.49279330626139051e-01%s", LFending); sprintf(tf[3], "4.01002128321867324e-01%s", LFending); sprintf(tf[4], "1.74138601152135891e-01%s", LFending); sprintf(tf[11], "-2.51139331838956803e+00%s", LFending); sprintf(tf[12], "-1.82354640868242068e+00%s", LFending); sprintf(tf[13], "1.44497990902399609e+00%s", LFending); sprintf(tf[14], "-1.34405691517736958e+00%s", LFending); sprintf(tf[15], "-9.75932420775945109e-01%s", LFending); sprintf(tf[16], "7.73329778651104860e-01%s", LFending); sprintf(tf[17], "1.92772511678346858e+00%s", 
LFending); sprintf(tf[18], "1.39973941472918284e+00%s", LFending); sprintf(tf[19], "-1.10915484383755047e+00%s", LFending); } else { sprintf(tf[2], "7.49279330626139051e-01%s", LFending); sprintf(tf[3], "-4.01002128321867324e-01%s", LFending); sprintf(tf[4], "-1.74138601152135891e-01%s", LFending); sprintf(tf[11], "2.51139331838956803e+00%s", LFending); sprintf(tf[12], "1.82354640868242068e+00%s", LFending); sprintf(tf[13], "-1.44497990902399609e+00%s", LFending); sprintf(tf[14], "1.34405691517736958e+00%s", LFending); sprintf(tf[15], "9.75932420775945109e-01%s", LFending); sprintf(tf[16], "-7.73329778651104860e-01%s", LFending); sprintf(tf[17], "-1.92772511678346858e+00%s", LFending); sprintf(tf[18], "-1.39973941472918284e+00%s", LFending); sprintf(tf[19], "1.10915484383755047e+00%s", LFending); } for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
//sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); if (res != VKFFT_SUCCESS) return res; } res = VkMovComplex(sc, sc->locID[0], regID[0]); if (res != VKFFT_SUCCESS) return res; uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 }; res = VkPermute(sc, permute, 13, 0, 0, w); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 6; i++) { res = 
VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 3; i++) { res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], sc->locID[i + 1], sc->locID[i + 4]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 4; i++) { res = VkAddComplex(sc, sc->locID[i + 1], regID[i * 3 + 1], regID[i * 3 + 2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], regID[i * 3 + 3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[2], sc->locID[2], tf[1]); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < 3; k++) { res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 3], sc->locID[k * 2 + 4]); if (k == 0) { res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; } else { res = VkMulComplexNumber(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2]); if (res != VKFFT_SUCCESS) return res; res = 
VkMulComplexNumber(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[k * 2 + 3], sc->locID[k * 2 + 3], regID[k * 2 + 4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 4], regID[k * 2 + 4]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[9], sc->locID[9], sc->locID[11]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[10], sc->locID[10], sc->locID[12]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[11], sc->locID[9], sc->locID[10]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[12], sc->locID[11], sc->locID[12]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], regID[9], regID[10]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[9], sc->locID[9], tf[11], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[10], sc->locID[10], tf[12], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[11], regID[11], tf[13], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[11], sc->locID[11], tf[14], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[12], sc->locID[12], tf[15], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[12], regID[12], tf[16], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[9], regID[9], tf[17], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, regID[10], regID[10], tf[18], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumberImag(sc, sc->locID[1], 
sc->locID[1], tf[19], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[9], sc->locID[9], regID[9]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[11], sc->locID[11], regID[9]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[10], sc->locID[10], regID[10]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[12], sc->locID[12], regID[10]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[11], regID[11], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[12], regID[12], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[9], sc->locID[9], regID[11]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[10], sc->locID[10], regID[11]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[11], sc->locID[11], regID[12]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[12], sc->locID[12], regID[12]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[1], regID[0], regID[1]); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 4; i++) { res = VkAddComplex(sc, sc->locID[i * 3 + 1], regID[i + 1], regID[i * 2 + 5]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i * 3 + 3], regID[i + 1], regID[i * 2 + 5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[i * 3 + 2], regID[i + 1], regID[i * 2 + 6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i * 3 + 3], sc->locID[i * 3 + 3], regID[i * 2 + 6]); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 3; i++) { res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[i + 4], sc->locID[i + 1], sc->locID[i + 4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[i + 1], regID[i + 1]); if 
(res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < 6; i++) { res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 }; res = VkPermute(sc, permute2, 13, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 20; i++) { free(tf[i]); tf[i] = 0; }*/ break; } case 14: { char* tf[8]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 8; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending); sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending); sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending); sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending); if (stageAngle < 0) { sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending); sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending); sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending); sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending); } else { sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending); sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending); sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending); sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending); } for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = 
sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 
2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; uint64_t P = 7; uint64_t Q = 2; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[5], regID[i + 5 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[6], regID[i + 6 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], sc->locID[1], sc->locID[6]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + Q], sc->locID[1], sc->locID[6]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + 4 * Q], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 5 * Q], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = 
VkAddComplex(sc, sc->locID[5], regID[i + Q], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[2], regID[i], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[3], regID[i + 4 * Q], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[4], regID[i + 2 * Q], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i], regID[i + Q], regID[i + 5 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2 * Q], regID[i + 5 * Q], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4 * Q], regID[i + 3 * Q], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i], regID[i], tf[5]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 2 * Q], regID[i + 2 * Q], tf[6]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[7]); if (res != VKFFT_SUCCESS) 
return res; res = VkSubComplex(sc, regID[i + 5 * Q], regID[i + 4 * Q], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplexInv(sc, regID[i + 6 * Q], regID[i + 4 * Q], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + 4 * Q], regID[i], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], sc->locID[0], sc->locID[1]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[4], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[2], regID[i], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[3], regID[i], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], regID[i + 4 * Q], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[6], regID[i + 6 * Q], sc->locID[5]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[3], sc->locID[6], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[2], sc->locID[5], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 5 * Q], 
sc->locID[3], sc->locID[6], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 6 * Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkMovComplex(sc, temp, regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; } res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); if (res != VKFFT_SUCCESS) return res; } uint64_t permute2[14] = { 0,2,4,6,8,10,12,1,3,5,7,9,11,13 }; res = VkPermute(sc, permute2, 14, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 8; i++) { free(tf[i]); tf[i] = 0; } break; } case 15: { char* tf[5]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 5; i++) { tf[i] = (char*)malloc(sizeof(char) * 50); if (!tf[i]) { for (uint64_t j = 0; j < i; j++) { free(tf[j]); tf[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf[0], "-0.5%s", LFending); sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending); sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending); sprintf(tf[3], 
"-0.809016994374947424102293417182819%s", LFending); sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); char* tf2[2]; //VkAppendLine(sc, " {\n"); for (uint64_t i = 0; i < 2; i++) { tf2[i] = (char*)malloc(sizeof(char) * 50); if (!tf2[i]) { for (uint64_t j = 0; j < i; j++) { free(tf2[j]); tf2[j] = 0; } return VKFFT_ERROR_MALLOC_FAILED; } } sprintf(tf2[0], "-0.5%s", LFending); sprintf(tf2[1], "-0.8660254037844386467637231707529%s", LFending); for (uint64_t i = radix - 1; i > 0; i--) { if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i == radix - 1) { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { 
if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkMulComplex(sc, regID[i], regID[i], w, temp); if (res != VKFFT_SUCCESS) return res; } //important //res = VkMovComplex(sc, regID[1], sc->locID[1]); //if (res != VKFFT_SUCCESS) return res; uint64_t P = 5; uint64_t Q = 3; for (uint64_t i = 0; i < Q; i++) { res = VkMovComplex(sc, sc->locID[0], regID[i]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) 
return res; res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[i], sc->locID[0]); 
if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < P; i++) { if (i > 0) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(2 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), 
LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)-sin(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, (double)cos(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, (double)sin(4 * i * double_PI / radix), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp); if (res != VKFFT_SUCCESS) return res; } else { res = VkMovComplex(sc, sc->locID[1], regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, sc->locID[2], regID[2]); if (res != VKFFT_SUCCESS) return res; } res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2]); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]); if (res != VKFFT_SUCCESS) return res; res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf2[0], regID[Q * i], sc->w); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf2[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[Q * i], sc->locID[0]); if (res != VKFFT_SUCCESS) return res; if (stageAngle < 0) { res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; } else { res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); if (res != VKFFT_SUCCESS) return res; res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); 
if (res != VKFFT_SUCCESS) return res; } } uint64_t permute2[15] = { 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14 }; res = VkPermute(sc, permute2, 15, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < 5; i++) { free(tf[i]); tf[i] = 0; } for (uint64_t i = 0; i < 2; i++) { free(tf2[i]); tf2[i] = 0; } break; } case 16: { if (res != VKFFT_SUCCESS) return res; if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } for (uint64_t i = 0; i < 8; i++) { res = VkMulComplex(sc, temp, regID[i + 8], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 8], regID[i], temp); if (res != VKFFT_SUCCESS) return 
res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.5%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 4; i++) { res = VkMulComplex(sc, temp, regID[i + 4], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if 
(stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 8; i < 12; i++) { res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.25%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 2; i++) { res = VkMulComplex(sc, temp, regID[i + 2], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 4; i < 6; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle 
< 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 8; i < 10; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } for 
(uint64_t i = 12; i < 14; i++) { res = VkMulComplex(sc, temp, regID[i + 2], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.125%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.125%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.125%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res;*/ } } } for (uint64_t i = 0; i < 1; i++) { res = VkMulComplex(sc, temp, regID[i + 1], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 2; i < 3; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 
0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 4; i < 5; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 6; i < 7; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < 2; j++) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != 
VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } } uint64_t permute2[16] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; res = VkPermute(sc, permute2, 16, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; /*res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[1], regID[8]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[8], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[2], regID[4]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[4], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[3], regID[12]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[12], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[5]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[5], regID[10]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[10], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[7]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[7], regID[14]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[14], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[11]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[11], regID[13]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[13], temp); if (res != VKFFT_SUCCESS) return res;*/ break; } case 32: { if (res != VKFFT_SUCCESS) return res; if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } for (uint64_t i = 0; i < 16; i++) { res = VkMulComplex(sc, temp, regID[i + 16], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 16], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = 
sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.5%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 8; i++) { res = VkMulComplex(sc, temp, regID[i + 8], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 8], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 16; i < 24; i++) { res = VkMulComplex(sc, temp, regID[i + 8], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 8], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.25%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " 
%s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 4; i++) { res = VkMulComplex(sc, temp, regID[i + 4], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 8; i < 12; i++) { res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", 
iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 16; i < 20; i++) { res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); } for (uint64_t i = 24; i < 28; i++) { res = VkMulComplex(sc, temp, regID[i + 4], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 4], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if 
(sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.125%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.125%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.125%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 2; i++) { res = VkMulComplex(sc, temp, regID[i + 2], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
//sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 4; i < 6; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 8; i < 10; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], 
temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 12; i < 14; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < 2; j++) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * 
double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 16 + 8 * j; i < 18 + 8 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 20 + 8 * j; i < 22 + 8 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 2], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } } if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->LUT) { if (sc->useCoalescedLUTUploadToSM) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 4 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = 
sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 4 * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.0625%s*angle);\n", w, cosDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.0625%s*angle);\n", w, sinDef, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(0.0625%s*angle);\n", w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, " %s.x=%s.x+1.0%s;\n", w, w, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s);\n", w, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } } for (uint64_t i = 0; i < 1; i++) { res = VkMulComplex(sc, temp, regID[i + 1], w, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen 
= sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); } for (uint64_t i = 2; i < 3; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, 0.70710678118654752440084436210485, LFending, w, 0.70710678118654752440084436210485, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 4; i < 5; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 6; i < 7; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < 2; j++) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 8), LFending, w, (double)sin((2 * j + 1) * double_PI / 8), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { res = 
VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t j = 0; j < 4; j++) { if ((j == 1) || (j == 2)) { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((7 - 
2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((7 - 2 * j) * double_PI / 16), LFending, w, (double)sin((7 - 2 * j) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, (double)cos((2 * j + 1) * double_PI / 16), LFending, w, (double)sin((2 * j + 1) * double_PI / 16), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 16 + 4 * j; i < 17 + 4 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } if (stageAngle < 0) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, iw, temp); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 18 + 4 * j; i < 19 + 4 * j; i++) { res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, regID[i + 1], regID[i], temp); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, regID[i], regID[i], temp); if (res != VKFFT_SUCCESS) return res; } } uint64_t permute2[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 }; res = VkPermute(sc, permute2, 32, 1, regID, temp); if (res != VKFFT_SUCCESS) return res; /*res = VkMovComplex(sc, temp, regID[1]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[1], regID[16]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[16], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[2]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[2], regID[8]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[8], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[3]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[3], regID[24]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[24], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[5]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[5], regID[20]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[20], temp); if (res != 
VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[6]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[6], regID[12]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[12], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[7]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[7], regID[28]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[28], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[9]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[9], regID[18]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[18], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[11]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[11], regID[26]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[26], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[13]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[13], regID[22]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[22], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[15]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[15], regID[30]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[30], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[19]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[19], regID[25]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[25], temp); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, temp, regID[23]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[23], regID[29]); if (res != VKFFT_SUCCESS) return res; res = VkMovComplex(sc, regID[29], temp); if (res != VKFFT_SUCCESS) return res;*/ break; } } return res; } static inline 
VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t sharedType) {
	/*
	 * Computes the shared-memory strides for the kernel being generated, stores
	 * them in sc, and appends to the generated source (through VkAppendLine):
	 *   - a "<uintType> sharedStride = <value>;" line;
	 *   - the sdata declaration for the active backend (backends 0, 3 and 4
	 *     declare a sized array, backends 1 and 2 alias sdata onto `shared`;
	 *     nothing is emitted here for backend 5).
	 *
	 * sc         - layout/state of the kernel being generated; stride fields,
	 *              usedSharedMemory and (with useRaderMult) RaderKernelOffsetShared
	 *              are updated in place.
	 * floatType  - "float" or "double"; selects the 2-component vector type name
	 *              and vecSize (8 or 16; presumably bytes per complex element).
	 * uintType   - type name used in the emitted sharedStride declaration.
	 * sharedType - selects which of the two stride layouts below is used;
	 *              values not listed in the switch emit nothing.
	 *
	 * Returns VKFFT_SUCCESS, or the first error reported by VkAppendLine.
	 *
	 * Error paths inside the switch `break` instead of returning, so that the
	 * temporary Rader reservation taken from sc->sharedMemSize /
	 * sc->sharedMemSizePow2 is always given back before the function returns.
	 */
	VkFFTResult res = VKFFT_SUCCESS;
	char vecType[30] = ""; // stays empty (instead of uninitialized) for an unrecognized floatType
	char sharedDefinitions[20] = "";
	uint64_t vecSize = 1;
	uint64_t maxSequenceSharedMemory = 0;

	if (!strcmp(floatType, "float")) {
#if(VKFFT_BACKEND==0)
		sprintf(vecType, "vec2");
		sprintf(sharedDefinitions, "shared");
#elif(VKFFT_BACKEND==1)
		sprintf(vecType, "float2");
		sprintf(sharedDefinitions, "__shared__");
#elif(VKFFT_BACKEND==2)
		sprintf(vecType, "float2");
		sprintf(sharedDefinitions, "__shared__");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sprintf(vecType, "float2");
		sprintf(sharedDefinitions, "__local");
#elif(VKFFT_BACKEND==5)
		sprintf(vecType, "float2");
		sprintf(sharedDefinitions, "shared");
#endif
		vecSize = 8;
	}
	if (!strcmp(floatType, "double")) {
#if(VKFFT_BACKEND==0)
		sprintf(vecType, "dvec2");
		sprintf(sharedDefinitions, "shared");
#elif(VKFFT_BACKEND==1)
		sprintf(vecType, "double2");
		sprintf(sharedDefinitions, "__shared__");
#elif(VKFFT_BACKEND==2)
		sprintf(vecType, "double2");
		sprintf(sharedDefinitions, "__shared__");
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sprintf(vecType, "double2");
		sprintf(sharedDefinitions, "__local");
#elif(VKFFT_BACKEND==5)
		sprintf(vecType, "double2");
		sprintf(sharedDefinitions, "shared");
#endif
		vecSize = 16;
	}

	// Temporarily take the Rader multiplication area out of the budget; it is
	// added back at the end of the function on every path.
	if (sc->useRaderMult) {
		sc->sharedMemSize -= sc->additionalRaderSharedSize * vecSize;
		sc->sharedMemSizePow2 -= sc->additionalRaderSharedSize * vecSize;
	}
	maxSequenceSharedMemory = sc->sharedMemSize / vecSize;

	// Extra elements per sequence when R2C / DCT-2 / odd-length DCT-4 sequences are merged.
	uint64_t additionalR2Cshared = 0;
	if ((sc->performR2C || ((sc->performDCT == 2) || ((sc->performDCT == 4) && ((sc->fftDim % 2) != 0)))) && (sc->mergeSequencesR2C) && (sc->axis_id == 0) && (!sc->performR2CmultiUpload)) {
		additionalR2Cshared = (sc->fftDim % 2 == 0) ? 2 : 1;
		if ((sc->performDCT == 2) || ((sc->performDCT == 4) && ((sc->fftDim % 2) != 0))) additionalR2Cshared = 1;
	}

	switch (sharedType) {
	case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_r2c
	{
		// Unpadded stride of one sequence and half the bank count; both are
		// reused by every formula in this case.
		const uint64_t baseStride = sc->fftDim / sc->registerBoost + additionalR2Cshared;
		const uint64_t halfBanks = sc->numSharedBanks / 2;

		sc->resolveBankConflictFirstStages = 0;
		sc->sharedStrideBankConflictFirstStages = ((sc->fftDim > halfBanks) && ((sc->fftDim & (sc->fftDim - 1)) == 0)) ? baseStride * (halfBanks + 1) / halfBanks : baseStride;
		sc->sharedStrideReadWriteConflict = (halfBanks <= sc->localSize[1]) ? baseStride + 1 : baseStride + halfBanks / sc->localSize[1];
		if (sc->sharedStrideReadWriteConflict < baseStride) sc->sharedStrideReadWriteConflict = baseStride;

		if (sc->useRaderFFT) {
			uint64_t raderStride = baseStride;
			uint64_t raderShift = 0;
			for (uint64_t p = 0; p < sc->numRaderPrimes; p++) {
				for (uint64_t s = 0; s < sc->raderContainer[p].numStages; s++) {
					if (sc->raderContainer[p].containerFFTNum >= 8) continue;
					uint64_t subLogicalGroupSize = (uint64_t)ceil(sc->raderContainer[p].containerFFTDim / (double)sc->raderContainer[p].registers_per_thread_per_radix[sc->raderContainer[p].stageRadix[s]]); // hopefully it is not <1, will fix
					uint64_t dimModBanks = sc->raderContainer[p].containerFFTDim % halfBanks;
					uint64_t shift = (subLogicalGroupSize > dimModBanks) ? subLogicalGroupSize - dimModBanks : 0;
					if (s == 0) shift = dimModBanks ? 0 : 1;
					uint64_t locStride = sc->raderContainer[p].containerFFTDim + shift;
					uint64_t candidate = sc->raderContainer[p].containerFFTNum * (locStride + 1);
					if (candidate > raderStride) {
						raderStride = candidate;
						// the shift is only tracked when it comes with a new largest stride
						if (shift > raderShift) raderShift = shift;
					}
				}
			}
			sc->sharedShiftRaderFFT = raderShift;
			sc->sharedStrideRaderFFT = raderStride;
		}

		// Largest of the candidate strides.
		sc->maxSharedStride = (sc->sharedStrideBankConflictFirstStages < sc->sharedStrideReadWriteConflict) ? sc->sharedStrideReadWriteConflict : sc->sharedStrideBankConflictFirstStages;
		if (sc->useRaderFFT) sc->maxSharedStride = (sc->maxSharedStride < sc->sharedStrideRaderFFT) ? sc->sharedStrideRaderFFT : sc->maxSharedStride;

		// If the padded layout exceeds the budget, fall back to the unpadded stride.
		sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
		if (sc->sharedMemSize < sc->usedSharedMemory) sc->maxSharedStride = baseStride;
		if (sc->maxSharedStride == baseStride) {
			sc->sharedStrideBankConflictFirstStages = baseStride;
			sc->sharedStrideReadWriteConflict = baseStride;
			if (sc->useRaderFFT) {
				sc->sharedStrideRaderFFT = baseStride;
				sc->sharedShiftRaderFFT = 0;
			}
		}

		sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->sharedStrideReadWriteConflict);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) break; // fall through to the restore below; res is returned afterwards

		sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride;
		if (sc->useRaderMult) {
			for (uint64_t i = 0; i < 20; i++) {
				sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize;
			}
			sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize;
		}
#if(VKFFT_BACKEND==0)
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#elif(VKFFT_BACKEND==1)
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#elif(VKFFT_BACKEND==2)
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#endif
		// Last emission of this case: success or failure, res is returned after the restore.
		break;
	}
	case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c + single_c2c_strided
	{
		const uint64_t baseStride = sc->fftDim / sc->registerBoost + additionalR2Cshared;
		const uint64_t halfBanks = sc->numSharedBanks / 2;

		uint64_t shift = (sc->fftDim < halfBanks) ? halfBanks / sc->fftDim : 1;
		sc->sharedStrideReadWriteConflict = ((sc->axisSwapped) && ((sc->localSize[0] % 4) == 0)) ? sc->localSize[0] + shift : sc->localSize[0];
		// Drop the padding when the padded layout would not fit in the budget.
		sc->maxSharedStride = (maxSequenceSharedMemory < sc->sharedStrideReadWriteConflict * baseStride) ? sc->localSize[0] : sc->sharedStrideReadWriteConflict;
		if (sc->maxSharedStride == sc->localSize[0]) sc->sharedStrideReadWriteConflict = sc->localSize[0];

		sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->maxSharedStride);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) break; // fall through to the restore below; res is returned afterwards

		sc->usedSharedMemory = vecSize * sc->maxSharedStride * baseStride;
		if (sc->useRaderMult) {
			for (uint64_t i = 0; i < 20; i++) {
				sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize;
			}
			sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize;
		}
#if(VKFFT_BACKEND==0)
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#elif(VKFFT_BACKEND==1)
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#elif(VKFFT_BACKEND==2)
		sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType);
		res = VkAppendLine(sc);
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
		sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize);
		res = VkAppendLine(sc);
#endif
		// Last emission of this case: success or failure, res is returned after the restore.
		break;
	}
	}

	// Give the Rader reservation back to the budget (reached on success and on error).
	if (sc->useRaderMult) {
		sc->sharedMemSize += sc->additionalRaderSharedSize * vecSize;
		sc->sharedMemSizePow2 += sc->additionalRaderSharedSize * vecSize;
	}
return res;
}
// Writes the declaration of one two-component variable followed by explicit zeroing of its .x and .y members.
// Each of the three statements is formatted into sc->tempStr and handed to VkAppendLine separately.
//  sc      - generator state (tempStr/tempLen are overwritten)
//  vecType - type name printed in front of the variable
//  name    - variable name; must not alias sc->tempStr
// Returns the first non-success code reported by VkAppendLine, VKFFT_SUCCESS otherwise.
static inline VkFFTResult appendZeroedVec2Declaration(VkFFTSpecializationConstantsLayout* sc, const char* vecType, const char* name) {
	VkFFTResult res = VKFFT_SUCCESS;
	sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, name);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, " %s.x=0;\n", name);
	res = VkAppendLine(sc);
	if (res != VKFFT_SUCCESS) return res;
	sc->tempLen = sprintf(sc->tempStr, " %s.y=0;\n", name);
	res = VkAppendLine(sc);
	return res;
}
// Appends the declarations (all zero-initialized) of the variables used by the generated kernel:
// temp_* registers, w, loc_*, Rader x0_* helpers, iw, the integer ids, disableThreads, LUTId/angle, mult
// and, if requested, the shuffle cache. It also allocates sc->regIDs and fills in the name buffers
// sc->w, sc->locID, sc->temp, sc->x0 and sc->iw.
//  floatType - "float" or "double"; selects the vector type name for the active backend
//  uintType  - integer type name used for the id variables
//  initType  - values 1 and 2 force the declaration of disableThreads
// Returns VKFFT_ERROR_MALLOC_FAILED if sc->regIDs can not be allocated, otherwise the first failure
// reported by VkAppendLine or VKFFT_SUCCESS.
static inline VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t initType) {
	VkFFTResult res = VKFFT_SUCCESS;
	//zero-initialized so an unexpected floatType/backend can not lead to printing an uninitialized buffer
	char vecType[30] = "";
	char uintType_32[30] = "";
	char regName[50];//"temp_" + two 20-digit numbers + separator fits
#if(VKFFT_BACKEND==0)
	if (!strcmp(floatType, "float")) sprintf(vecType, "vec2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2");
	sprintf(uintType_32, "uint");
#elif((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)||(VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(uintType_32, "unsigned int");
#elif(VKFFT_BACKEND==5)
	if (!strcmp(floatType, "float")) sprintf(vecType, "float2");
	if (!strcmp(floatType, "double")) sprintf(vecType, "double2");
	sprintf(uintType_32, "uint");
#endif
	uint64_t logicalStoragePerThread = sc->registers_per_thread * sc->registerBoost;

	//base registers temp_0 .. temp_{registers_per_thread-1} are declared in every configuration
	for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
		sprintf(regName, "temp_%" PRIu64 "", i);
		res = appendZeroedVec2Declaration(sc, vecType, regName);
		if (res != VKFFT_SUCCESS) return res;
	}
	//convolution adds one extra register set temp_<i>_<j> per additional matrix row j
	if (sc->convolutionStep) {
		for (uint64_t j = 1; j < sc->matrixConvolution; j++) {
			for (uint64_t i = 0; i < sc->registers_per_thread; i++) {
				sprintf(regName, "temp_%" PRIu64 "_%" PRIu64 "", i, j);
				res = appendZeroedVec2Declaration(sc, vecType, regName);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}

	//table of register names, one heap-allocated 50 byte string per logical register
	sc->regIDs = (char**)malloc(sizeof(char*) * logicalStoragePerThread);
	if (!sc->regIDs) return VKFFT_ERROR_MALLOC_FAILED;
	for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
		sc->regIDs[i] = (char*)malloc(sizeof(char) * 50);
		if (!sc->regIDs[i]) {
			//roll back everything allocated so far
			for (uint64_t j = 0; j < i; j++) {
				free(sc->regIDs[j]);
				sc->regIDs[j] = 0;
			}
			free(sc->regIDs);
			sc->regIDs = 0;
			return VKFFT_ERROR_MALLOC_FAILED;
		}
		//boosted and regular registers share the same naming scheme
		sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i);
	}

	//register boost: declare the additional sets temp_{registers_per_thread} .. temp_{logicalStoragePerThread-1}
	for (uint64_t i = 1; i < sc->registerBoost; i++) {
		for (uint64_t j = 0; j < sc->registers_per_thread; j++) {
			sprintf(regName, "temp_%" PRIu64 "", j + i * sc->registers_per_thread);
			res = appendZeroedVec2Declaration(sc, vecType, regName);
			if (res != VKFFT_SUCCESS) return res;
		}
	}

	res = appendZeroedVec2Declaration(sc, vecType, "w");
	if (res != VKFFT_SUCCESS) return res;
	sprintf(sc->w, "w");

	uint64_t maxNonPow2Radix = sc->maxNonPow2Radix;
	for (uint64_t i = 0; i < sc->usedLocRegs; i++) {
		sprintf(sc->locID[i], "loc_%" PRIu64 "", i);
		res = appendZeroedVec2Declaration(sc, vecType, sc->locID[i]);
		if (res != VKFFT_SUCCESS) return res;
	}
	//sc->temp is an alias of the first loc register
	sprintf(sc->temp, "%s", sc->locID[0]);

	if (sc->useRaderFFT) {
		for (uint64_t i = 0; i < 2; i++) {
			sprintf(sc->x0[i], "x0_%" PRIu64 "", i);
			res = appendZeroedVec2Declaration(sc, vecType, sc->x0[i]);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	if (sc->useRaderMult) {
		int64_t rader_fft_regs = (sc->useRaderFFT) ? 2 : 0;
		int64_t rader_mult_regs = sc->raderRegisters / 2 - rader_fft_regs;
		if (rader_mult_regs <= (int64_t)sc->usedLocRegs - 1) {
			//enough loc registers: x0 entries reuse loc_1, loc_2, ...
			for (int64_t i = 0; i < rader_mult_regs; i++) {
				sprintf(sc->x0[i + rader_fft_regs], "%s", sc->locID[i + 1]);
			}
		}
		else {
			//reuse what is available, declare dedicated x0_* variables for the rest
			for (int64_t i = 0; i < (int64_t)sc->usedLocRegs - 1; i++) {
				sprintf(sc->x0[i + rader_fft_regs], "%s", sc->locID[i + 1]);
			}
			for (int64_t i = sc->usedLocRegs - 1; i < rader_mult_regs; i++) {
				//cast: PRIu64 expects an unsigned 64-bit argument
				sprintf(sc->x0[i + rader_fft_regs], "x0_%" PRIu64 "", (uint64_t)(i + rader_fft_regs));
				res = appendZeroedVec2Declaration(sc, vecType, sc->x0[i + rader_fft_regs]);
				if (res != VKFFT_SUCCESS) return res;
			}
		}
	}

	//iw is needed if any stage has radix 8, 16 or 32, or if Rader FFT is used
	uint64_t useRadix8plus = 0;
	for (uint64_t i = 0; i < sc->numStages; i++)
		if ((sc->stageRadix[i] == 8) || (sc->stageRadix[i] == 16) || (sc->stageRadix[i] == 32) || (sc->useRaderFFT)) useRadix8plus = 1;
	if (useRadix8plus == 1) {
		if (maxNonPow2Radix > 1) {
			//reuse loc_1 instead of declaring a new variable
			sprintf(sc->iw, "%s", sc->locID[1]);
		}
		else {
			res = appendZeroedVec2Declaration(sc, vecType, "iw");
			if (res != VKFFT_SUCCESS) return res;
			sprintf(sc->iw, "iw");
		}
	}

	//integer ids, all starting at zero
	const char* uintIDs[5] = { sc->stageInvocationID, sc->blockInvocationID, sc->sdataID, sc->combinedID, sc->inoutID };
	for (uint64_t i = 0; i < 5; i++) {
		sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, uintIDs[i]);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if ((sc->fftDim < sc->fft_dim_full) || (initType == 1) || (initType == 2)) {
		sc->tempLen = sprintf(sc->tempStr, " %s disableThreads=1;\n", uintType_32);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->useRader) {
		sc->tempLen = sprintf(sc->tempStr, " %s %s = 0;\n", uintType, sc->raderIDx);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, " %s %s = 0;\n", uintType, sc->raderIDx2);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	if (sc->LUT) {
		sc->tempLen = sprintf(sc->tempStr, " %s LUTId=0;\n", uintType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	//angle is skipped only when both LUT and LUT_4step are set
	if ((!sc->LUT) || (!sc->LUT_4step)) {
		sc->tempLen = sprintf(sc->tempStr, " %s angle=0;\n", floatType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
	//the previous three-way condition (A && !B) || B || C with B implying A is equivalent to A || C
	if ((sc->stageStartSize > 1) || (sc->performDCT)) {
		sc->tempLen = sprintf(sc->tempStr, " %s mult;\n", vecType);
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, " mult.x = 0;\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
		sc->tempLen = sprintf(sc->tempStr, " mult.y = 0;\n");
		res = VkAppendLine(sc);
		if (res != VKFFT_SUCCESS) return res;
	}
if (sc->cacheShuffle) { sc->tempLen = sprintf(sc->tempStr, "\ %s tshuffle= ((%s>>1))%%(%" PRIu64 ");\n\ %s shuffle[%" PRIu64 "];\n", uintType, sc->gl_LocalInvocationID_x, sc->registers_per_thread, vecType, sc->registers_per_thread); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < sc->registers_per_thread; i++) { /*sc->tempLen = sprintf(sc->tempStr, "\ shuffle[%" PRIu64 "];\n", i, vecType);*/ sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].x = 0;\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].y = 0;\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } return res; } static inline VkFFTResult appendZeropadStart(VkFFTSpecializationConstantsLayout* sc) { //return if sequence is full of zeros from the start VkFFTResult res = VKFFT_SUCCESS; if ((sc->frequencyZeropadding)) { switch (sc->axis_id) { case 0: { break; } case 1: { if (!sc->supportAxis) { char idX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } case 2: { if (!sc->supportAxis) { char idY[500] = ""; if (sc->performWorkGroupShift[1])//y axis is along z workgroup here sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); else sprintf(idY, "%s", sc->gl_GlobalInvocationID_z); char idX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, 
sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { char idY[500] = ""; if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idY, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } } } else { switch (sc->axis_id) { case 0: { char idY[500] = ""; uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->axisSwapped) { if (mult != 1) { if (sc->performWorkGroupShift[1]) sprintf(idY, "((%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")* %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult); else sprintf(idY, "((%s + %s * %" PRIu64 ")*%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult); } else { if (sc->performWorkGroupShift[1]) sprintf(idY, "(%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]); else sprintf(idY, "(%s + %s * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]); } char idZ[500] = ""; if (sc->performWorkGroupShift[2]) sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); else sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (mult != 1) { if (sc->performWorkGroupShift[1]) sprintf(idY, "((%s + consts.workGroupShiftY * %s)* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y, mult); else sprintf(idY, "(%s* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, mult); } else { if (sc->performWorkGroupShift[1]) sprintf(idY, "(%s + consts.workGroupShiftY * %s)", sc->gl_GlobalInvocationID_y, 
sc->gl_WorkGroupSize_y); else sprintf(idY, "%s", sc->gl_GlobalInvocationID_y); } char idZ[500] = ""; if (sc->performWorkGroupShift[2]) sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); else sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } case 1: { char idZ[500] = ""; if (sc->performWorkGroupShift[2]) sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); else sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 2: { break; } } } return res; } static inline VkFFTResult appendZeropadEnd(VkFFTSpecializationConstantsLayout* sc) { //return if sequence is full of zeros from the start VkFFTResult res = VKFFT_SUCCESS; if ((sc->frequencyZeropadding)) { switch (sc->axis_id) { case 0: { break; } case 1: { if (!sc->supportAxis) { char idX[500] = ""; if 
(sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } case 2: { if (!sc->supportAxis) { char idY[500] = ""; if (sc->performWorkGroupShift[1])//y axis is along z workgroup here sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); else sprintf(idY, "%s", sc->gl_GlobalInvocationID_z); char idX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { char idY[500] = ""; if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idY, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } } } else { switch (sc->axis_id) { case 0: { //char idY[500] = ""; if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { 
sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 1: { char idZ[500] = ""; if (sc->performWorkGroupShift[2]) sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); else sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z); if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 2: { break; } } } return res; } static inline VkFFTResult appendZeropadStartReadWriteStage(VkFFTSpecializationConstantsLayout* sc, uint64_t readStage) { //return if sequence is full of zeros from the start VkFFTResult res = VKFFT_SUCCESS; if ((sc->frequencyZeropadding)) { switch (sc->axis_id) { case 0: { break; } case 1: { if (!sc->supportAxis) { char idX[500] = ""; if (readStage) { sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->inputStride[1]); } else { sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->outputStride[1]); } if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } case 2: { if (!sc->supportAxis) { char idY[500] = ""; char idX[500] = ""; if (readStage) { sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->inputStride[1]); } else { sprintf(idY, "(%s/%" PRIu64 ") %% %" 
PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->outputStride[1]); } if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { char idY[500] = ""; if (readStage) { sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); } else { sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } break; } } } else { switch (sc->axis_id) { case 0: { char idY[500] = ""; char idZ[500] = ""; //uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (readStage) { sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[2], sc->inputStride[3] / sc->inputStride[2]); } else { sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[2], sc->outputStride[3] / sc->outputStride[2]); } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 1: { char idZ[500] = ""; if (readStage) { sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[2], sc->inputStride[3] / sc->inputStride[2]); } else { sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[2], sc->outputStride[3] / sc->outputStride[2]); } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 2: { break; } } } return res; } static inline VkFFTResult appendZeropadEndReadWriteStage(VkFFTSpecializationConstantsLayout* sc) { 
//return if sequence is full of zeros from the start VkFFTResult res = VKFFT_SUCCESS; if ((sc->frequencyZeropadding)) { switch (sc->axis_id) { case 0: { break; } case 1: { char idX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 2: { if (sc->performZeropaddingFull[0]) { if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } } } else { switch (sc->axis_id) { case 0: { if (sc->performZeropaddingFull[1]) { if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 1: { if (sc->performZeropaddingFull[2]) { if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 2: { break; } } } return res; } static inline VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) { 
VkFFTResult res = VKFFT_SUCCESS; uint64_t used_registers_read = 1; switch (readType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); break; case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); break; } if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; //appendZeropadStart(sc); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { switch (readType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * 
sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://single_c2c { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (i + 1) 
> sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } } } //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult setReadToRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { VkFFTResult res = VKFFT_SUCCESS; switch (readType) { case 0: //single_c2c { if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->actualInverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 1: //grouped_c2c { if ((sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 2: //single_c2c_strided { if ((sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 5://single_r2c { if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 6: //single_c2r { if ((sc->rader_generator[0] > 0) || ((sc->fftDim % sc->localSize[0]) && (!sc->axisSwapped)) || ((sc->fftDim % sc->localSize[1]) && (sc->axisSwapped))) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: { sc->readToRegisters = 0; break; } case 144: { uint64_t registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? 
sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; if ((sc->rader_generator[0] > 0) || ((sc->fftDim / registers_first_stage) != sc->localSize[0])) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } case 145: { uint64_t registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; if ((sc->rader_generator[0] > 0) || ((sc->fftDim / registers_first_stage) != sc->localSize[1])) sc->readToRegisters = 0; else sc->readToRegisters = 1; break; } } return res; } static inline VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) { VkFFTResult res = VKFFT_SUCCESS; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; char vecType[30]; char inputsStruct[20] = ""; char LFending[4] = ""; char uintType_32[30]; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (sc->inputBufferBlockNum == 1) sprintf(inputsStruct, "inputs"); else sprintf(inputsStruct, ".inputs"); if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); sprintf(uintType_32, "uint"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(uintType_32, "unsigned int"); sprintf(inputsStruct, "inputs"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); sprintf(uintType_32, "unsigned int"); sprintf(inputsStruct, "inputs"); char 
cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); sprintf(inputsStruct, "inputs"); sprintf(uintType_32, "unsigned int"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; #elif(VKFFT_BACKEND==5) sprintf(inputsStruct, "inputs"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); sprintf(uintType_32, "uint"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #endif char convTypeLeft[20] = ""; char convTypeRight[20] = ""; if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) { if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "vec2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, 
"conv_float2("); sprintf(convTypeRight, ")"); #endif } } if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) { if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "dvec2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #endif } } char index_x[2000] = ""; char index_y[2000] = ""; char requestCoordinate[100] = ""; if (sc->convolutionStep) { if (sc->matrixConvolution > 1) { sprintf(requestCoordinate, "coordinate"); } } char requestBatch[100] = ""; if (sc->convolutionStep) { if (sc->numKernels > 1) { sprintf(requestBatch, "0");//if one buffer - multiple kernel convolution } } //appendZeropadStart(sc); switch (readType) { case 0://single_c2c { //sc->tempLen = sprintf(sc->tempStr, " return;\n"); char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->axisSwapped) { if 
(sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); } char shiftY2[100] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim < sc->fft_dim_full) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? %" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (uint64_t)ceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); //sc->tempLen = sprintf(sc->tempStr, " if (numActiveThreads>%" PRIu64 ") numActiveThreads = %" PRIu64 ";\n", sc->localSize[0]* sc->localSize[1], sc->localSize[0]* sc->localSize[1]); //sprintf(sc->disableThreadsStart, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " 
disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } else { sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } } else { sc->tempLen = sprintf(sc->tempStr, " { \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = 
(%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { #if (VKFFT_BACKEND!=2) //AMD compiler fix sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { #if (VKFFT_BACKEND!=2) //AMD compiler fix sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * 
sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); } else { if (sc->axisSwapped) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / 
%" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% 
%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { /* if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / 
sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); */ if (sc->axisSwapped) { if ((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / used_registers_read / (sc->firstStageStartSize / sc->fftDim) > sc->localSize[0]) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read)); } else { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read)); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read)); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * 
sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { //not used if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->fftDim % sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / 
%" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->fftDim % sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEndReadWriteStage(sc); if (res != 
VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 1://grouped_c2c { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); sc->tempLen = sprintf(sc->tempStr, " disableThreads = (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 
1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", 
sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->readToRegisters) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 
used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 
")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 2://single_c2c_strided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); //sc->tempLen = sprintf(sc->tempStr, " if(gl_GlobalInvolcationID.x%s >= %" PRIu64 ") return; \n", shiftX, sc->size[0] / axis->specializationConstants.fftDim); sc->tempLen = sprintf(sc->tempStr, " disableThreads = (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { 
sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->readToRegisters) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, 
sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 5://single_r2c { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->fftDim == sc->fft_dim_full) { uint64_t used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = 
sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, 
inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[(%s + %" PRIu64 ")]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, sc->inputStride[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[(%s + %" PRIu64 ")/ %" PRIu64 "]%s[(%s + %" PRIu64 ") %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); else sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->axisSwapped) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) 
sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %s%s[inoutID]%s;\n", sc->fftDim, 
sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / 
(double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { //Not implemented } break; } case 6: {//single_c2r //sc->tempLen = sprintf(sc->tempStr, " return;\n"); char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[100] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); if (sc->fftDim < sc->fft_dim_full) { //not implemented if (sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); else sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 
1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } else { sc->tempLen = sprintf(sc->tempStr, " { \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { uint64_t num_in = (sc->axisSwapped) ? 
(uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); //num_in =(uint64_t)ceil(num_in / (double)sc->min_registers_per_thread); for (uint64_t i = 0; i < num_in; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->inputStride[0], sc->fftDim / 2 + 1, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { #if (VKFFT_BACKEND!=2) //AMD compiler fix sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif } if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > 
mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { #if (VKFFT_BACKEND!=2) //AMD compiler fix sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif } if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (0) { //not 
enabled if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!sc->axisSwapped) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } 
} res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (0) { //not enabled sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = 
sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) { if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; } if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) { if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, 
sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * 
sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - 
(sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } else { if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, 
sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) { if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) { if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / 
(double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } } else { if (sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 
2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { if (sc->axisSwapped) { if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * 
sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) { if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) { if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" 
PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } else { if (i < 
((sc->fftDim / 2 + 1) / sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) { if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) { if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / 
(double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 
1) % sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } } } //sc->readToRegisters = 1; if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } if (!sc->readToRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } } else { } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 110://DCT-I nonstrided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; } sc->fftDim = (sc->fftDim + 2) / 2; uint64_t num_in = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % 
sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /* store the status of the index expression generator: without the assignment the check right after it re-tests the previous VkAppendLine result and a failure here goes unnoticed */ res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% 
%" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% 
%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } sc->fftDim = 2 * sc->fftDim - 2; if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; } else { //Not implemented } break; } case 111://DCT-I strided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; } sc->fftDim = (sc->fftDim + 2) / 2; uint64_t num_in = (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } 
if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " //sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult); else sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); } else { sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); } res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, 
sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->fftDim = 2 * sc->fftDim - 2; if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; } else { //Not implemented } break; } case 120://DCT-II nonstrided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /* store the status of the index expression generator: without the assignment the check right after it re-tests the previous VkAppendLine result and a failure here goes unnoticed */ res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", 
convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = 
sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } } else { //Not implemented } break; } case 121://DCT-II strided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < mult * used_registers_read; i++) { //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { 
sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult, sc->localSize[0], mult); else sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + %s;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]); } else { sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); } res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return 
res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } } else { //Not implemented } break; } case 130://DCT-III nonstrided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) 
sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; } uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]); } else { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], 
(uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", 
sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, 
uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " 
sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = 
appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; } else { //Not implemented } break; } case 131://DCT-III strided { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1)) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", 
sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); } else { sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); } res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if 
(sc->mergeSequencesR2C) { } else { sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID];\n", sc->startDCT3LUT); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID) );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID) );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID) );\n", (double)(double_PI / 2 / sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f - %%f \\n\", mult.x, mult.y);\n"); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " //sdataID = (combinedID) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult); else sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID) * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (combinedID > 0){\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sprintf(index_x, "(%s + %" PRIu64 " 
* ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); sprintf(index_y, "(%" PRIu64 " - (%s/%" PRIu64 " + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); } else { sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); } res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) { } else { sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID) * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = -((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x)*mult.x-(%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y)*mult.x+(%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if ((uint64_t)ceil(sc->size[0] / 
(double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; } else { //Not implemented } break; } case 140://DCT-IV nonstrided cast to 8x FFT { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->axisSwapped) { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); } char shiftY2[100] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); if (sc->fftDim < sc->fft_dim_full) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? 
%" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / sc->min_registers_per_thread / (sc->firstStageStartSize / sc->fftDim), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } else { sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full); res = 
VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } } else { sc->tempLen = sprintf(sc->tempStr, " { \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->inputStride[0], sc->fftDim / 8, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } 
sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, 
sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, 
sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = 
sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } /*else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->axisSwapped) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, 
sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" 
PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" 
PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } }*/ sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 141://DCT-IV strided cast to 8x FFT { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if (sc->fftDim != sc->fft_dim_full) { sprintf(sc->disableThreadsStart, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } else { sprintf(sc->disableThreadsStart, "{\n"); res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; sprintf(sc->disableThreadsEnd, "}"); } sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { if (sc->fftDim == 
sc->fft_dim_full) sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, 
sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 ")+1)+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " 
- 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 + 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 142://DCT-IV nonstrided as 2xN/2 DCT-II { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } uint64_t maxBluesteinCutOff = 1; if (sc->zeropadBluestein[0]) { if (sc->axisSwapped) maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[0]; else maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[1]; } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; /* store the status returned by indexInputVkFFT: without this assignment the check that follows would re-test the preceding VkAppendLine result and a failure while emitting the index expression would go unnoticed */ res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = 
sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else if (i < used_registers_read) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = 
sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = 
sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * 
sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" 
PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = 
appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i < used_registers_read) { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " 
}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 
sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], 
sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i < used_registers_read) { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); } else { //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } #endif res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { if (sc->axisSwapped) maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; else maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 
used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w); } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-1].y;\n", sc->w); } 
res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim - 1, sc->fftDim); } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim - 1, sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if 
(sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", %s.x, %s.y, %s);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, 
" if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim); } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * 
sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)) res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * 
sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim); } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res;*/ } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; #endif res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); 
res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]); res = VkAppendLine(sc); 
if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " == 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 
1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } } else { //Not implemented } break; } case 143://DCT-IV strided as 2xN/2 DCT-II { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if 
(sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around if (sc->inputBufferBlockNum == 1) sc->tempLen = 
sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #else if (i < used_registers_read) { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", 
sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i < used_registers_read) { sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t 
i = 0; i < 2 * used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 
")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i < used_registers_read) { sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", 
sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } #endif res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " 
if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim - 1, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 
2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f\\n\", %s.x, %s.y);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" 
PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)))//OpenCL, Level Zero and Metal are not handling barrier with thread-conditional writes to local memory - so this is a work-around sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #endif sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); 
if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; #if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)||(VKFFT_BACKEND==5)) res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " 
}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; #endif res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]); for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < num_in; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", cosDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", sinDef, (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID / %" PRIu64 ") );\n", (double)(double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = 
((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " } else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } } else { //Not implemented } break; } case 144://odd DCT-IV nonstrided as N FFT { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; uint64_t used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], 
(uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) 
return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if 
(sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = 
sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k 
* used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * (combinedID %% %" PRIu64 ");\n", sc->fftDim / 2, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID + %s * sharedStride;\n", sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 2 * 
sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * 
sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } if (!sc->readToRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if (sc->axisSwapped) { if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + 
i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } else { //Not implemented } break; } case 145://odd DCT-IV strided as N FFT { char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[0]) { res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < mult * used_registers_read; i++) { //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->mergeSequencesR2C) sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * mult * used_registers_read) * 
sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]); } else { sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); } res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 1); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, 
convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[0]) { sc->tempLen = sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); res = VkAppendLine(sc); 
if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * 
sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ %s.x = -%s.x;\n\ %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[0]) { sc->fftDim = sc->fft_dim_full; used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } if (!sc->readToRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_read; i++) { if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } else { //Not implemented } break; } } return res; } static inline VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if 
(!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif uint64_t logicalRegistersPerThread = (sc->rader_generator[0] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[0]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; switch (reorderType) { case 1: {//grouped_c2c char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if ((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)) { if (!sc->readToRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->readToRegisters = 0; } else sc->readToRegisters = 1;*/ res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % 
sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; if ((sc->LUT) && (sc->LUT_4step)) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " angle = 2 * %.17e%s * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %.17e%s;\n", 3.1415926535897932384626433832795, LFending, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, "\ w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y * mult.x + %s.x * 
mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = w.x;\n", sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].x = w.x;\n", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 2: {//single_c2c_strided char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if ((!sc->reorderFourStep) && (sc->inverse)) { if (!sc->readToRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) { res = appendBarrierVkFFT(sc, 1); sc->readToRegisters = 0; } else sc->readToRegisters = 1;*/ res = 
appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; if ((sc->LUT) && (sc->LUT_4step)) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " angle = 2 * %.17e%s * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %.17e%s);\n", 3.1415926535897932384626433832795, LFending, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, 
"double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->readToRegisters) { sc->tempLen = sprintf(sc->tempStr, "\ w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = w.x;\n", sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].x = w.x;\n", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } //appendBarrierVkFFT(sc, 1); break; } } return res; } static inline VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout* sc, const char* 
floatType, const char* uintType, uint64_t reorderType) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif uint64_t logicalRegistersPerThread = (sc->rader_generator[sc->numStages - 1] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; switch (reorderType) { case 1: {//grouped_c2c char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if ((sc->stageStartSize > 1) && (!((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)))) { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->writeFromRegisters = 0; } else sc->writeFromRegisters = 1;*/ res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; if ((sc->LUT) && (sc->LUT_4step)) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = 
sprintf(sc->tempStr, " angle = 2 * %.17e%s * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %.17e%s;\n", 3.1415926535897932384626433832795, LFending, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inverse) { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->writeFromRegisters) { sc->tempLen = sprintf(sc->tempStr, "\ w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 
w.x;\n", sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].x = w.x;\n", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 2: {//single_c2c_strided char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if (!((!sc->reorderFourStep) && (sc->inverse))) { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->writeFromRegisters = 0; } else sc->writeFromRegisters = 1;*/ res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = 
VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; if ((sc->LUT) && (sc->LUT_4step)) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " angle = 2 * %.17e%s * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %.17e%s);\n", 3.1415926535897932384626433832795, LFending, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inverse) { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = 
sincos_20(angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->writeFromRegisters) { sc->tempLen = sprintf(sc->tempStr, "\ w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = w.x;\n", sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s].x = w.x;\n", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (((sc->fftDim % 
sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } //appendBarrierVkFFT(sc, 1); break; } } return res; } static inline VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t strideType, uint64_t pre_or_post_multiplication) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); //char cosDef[20] = "cos"; //char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //char cosDef[20] = "__cosf"; //char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //char cosDef[20] = "__cosf"; //char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //char cosDef[20] = "native_cos"; //char sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //char 
cosDef[20] = "cos"; //char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char index_x[2000] = ""; //char index_y[2000] = ""; //char requestBatch[100] = ""; //char separateRegisterStore[100] = ""; char kernelName[100] = ""; sprintf(kernelName, "BluesteinMultiplication"); if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; uint64_t used_registers = 1; switch (strideType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); break; case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); break; } for (uint64_t i = 0; i < used_registers; i++) { switch (strideType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[0] * ((1 + i)) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim - i * sc->localSize[0]; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->localSize[1] * ((1 + i)) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim - i * sc->localSize[1]; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } switch (strideType) { case 0: case 2: case 5: 
case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->fftDim == sc->fft_dim_full) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sprintf(index_x, " (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); sc->tempLen = sprintf(sc->tempStr, " %s = %s;\n", sc->inoutID, index_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); } break; } case 1: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->fftDim == sc->fft_dim_full) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * 
sc->stageStartSize, sc->fftDim * sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " w = %s[%s];\n", kernelName, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //uint64_t k = 0; if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride];\n", sc->regIDs[i], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->inverseBluestein) res = VkMulComplex(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp); else res = VkMulComplexConj(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp); if (res != VKFFT_SUCCESS) return res; if (!((sc->readToRegisters && 
(pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride] = %s;\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } switch (strideType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[0] * ((1 + i)) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->localSize[1] * ((1 + i)) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, 
long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) { VkFFTResult res = VKFFT_SUCCESS; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; char vecType[30]; char LFending[4] = ""; char tempNum[100] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char stageNormalization[50] = ""; uint64_t normalizationValue = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { if ((sc->performDCT) && (sc->actualInverse)) { if (sc->performDCT == 1) normalizationValue 
= (sc->sourceFFTSize - 1) * 2; else normalizationValue = sc->sourceFFTSize * 2; } else normalizationValue = sc->sourceFFTSize; } if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { normalizationValue *= sc->fft_dim_full; } if (normalizationValue != 1) { sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); } char convolutionInverse[10] = ""; sc->useCoalescedLUTUploadToSM = 0; if (sc->convolutionStep) { if (stageAngle < 0) sprintf(convolutionInverse, ", 0"); else sprintf(convolutionInverse, ", 1"); } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; //rotate the stage char* gl_LocalInvocationID = (strided) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x; if (stageSize > 1) { uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] : sc->localSize[0]; uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim) / (double)(num_logical_subgroups)); for (uint64_t t = 0; t < num_logical_groups; t++) { if (((1 + t) * num_logical_subgroups) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim - t * num_logical_subgroups; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", gl_LocalInvocationID, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, gl_LocalInvocationID, t * num_logical_subgroups, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+(%s+ %" PRIu64 ") / %" PRIu64 "];\n\n", sc->w, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ");\n", sc->sdataID, gl_LocalInvocationID, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->resolveBankConflictFirstStages == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * 
%" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (((1 + t) * num_logical_subgroups) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } uint64_t raderTranspose = ((sc->currentRaderContainer->containerFFTNum < 8) || (sc->currentRaderContainer->numStages == 1) || (strided)) ? 
0 : 1; // read x0 - to be used in the end { uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[0]; uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", 
sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->sdataID, sc->raderIDx2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->x0[0], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); } // read x0 for x0+x1 - 0-element { uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ if (%s == 0) {\n", sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->sdataID, sc->raderIDx2); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->x0[1], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->currentRaderContainer->numStages == 1) { if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } uint64_t locStageSize = 1; uint64_t locStageSizeSum = 0; long double locStageAngle = -double_PI; uint64_t shift = 0; for (uint64_t rader_stage = 0; rader_stage < sc->currentRaderContainer->numStages; rader_stage++) { uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix uint64_t locFFTDimStride = locFFTDim; if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; //local radix if ((rader_stage == 0) || (!raderTranspose)) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t k = 0; k < sc->registerBoost; k++) { if ((rader_stage == 0) || (!raderTranspose)) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUT); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)(locStageAngle), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t g = sc->currentRaderContainer->generator; if (rader_stage == 0) { if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%s + %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%s + %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, j * 
subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->currentRaderContainer->raderUintLUToffset); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%s + %" PRIu64 ");\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, sc->sdataID, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 " + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } uint64_t id = j + i * logicalRegistersPerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; if (!strided) { if (sc->resolveBankConflictFirstStages == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s 
= %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } char** regID = (char**)malloc(sizeof(char*) * locStageRadix); if (regID) { for (uint64_t i = 0; i < locStageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", sc->regIDs[id]); } res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < locStageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; } if (rader_stage != sc->currentRaderContainer->numStages - 1) { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } //local shuffle char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); if (tempID) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } for (uint64_t k = 0; k < sc->registerBoost; ++k) { uint64_t t = 0; if ((rader_stage == 0) || (!raderTranspose)) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //last stage - save x1 if (rader_stage == sc->currentRaderContainer->numStages - 1) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s == 0) {\n", sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->x0[1], sc->x0[1], sc->regIDs[0]); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strided) { if (rader_stage != 0) { shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? 
subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0; if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; } else { if (sc->sharedShiftRaderFFT > 0) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sharedStride = %" PRIu64 ";\n", sc->sharedStrideRaderFFT); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if ((rader_stage == 0) || (!raderTranspose)) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 
0 : 1; if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; } } for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize); res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", locStageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", locStageRadix); res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / 
locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); t++; if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { sprintf(tempNum, "%" PRIu64 "", i * locStageSize); res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum); if (res != VKFFT_SUCCESS) return res; //last stage - mult rader kernel if (rader_stage == sc->currentRaderContainer->numStages - 1) { if (sc->inline_rader_kernel) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = r_rader_kernel_%" PRIu64 "[%s];\n\ %s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->combinedID, sc->w, stageRadix, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ printf(\"%%f %%f - %%f %%f\\n\", %s.x, %s.y, %s.x, %s.y);\n", sc->regIDs[id], sc->regIDs[id], sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ res = VkMulComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->w, sc->temp); if (res != VKFFT_SUCCESS) return res; } if (rader_stage != sc->currentRaderContainer->numStages - 1) { if (!raderTranspose) { sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; } else { sprintf(tempNum, "%" PRIu64 "", sc->currentRaderContainer->containerFFTNum); res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 
"", sc->fftDim / stageRadix); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%s", sc->raderIDx2); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; } if (!strided) { if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) { if (sc->resolveBankConflictFirstStages == 0) { sc->resolveBankConflictFirstStages = 1; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->resolveBankConflictFirstStages == 1) { sc->resolveBankConflictFirstStages = 0; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (strided) { res = VkMulReal(sc, sc->sdataID, sc->sdataID, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); if (res != VKFFT_SUCCESS) return res; } } //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); res = VkSharedStore(sc, sc->sdataID, 
sc->regIDs[id]); if (res != VKFFT_SUCCESS) return res; } } /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; } if (rader_stage != sc->currentRaderContainer->numStages - 1) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { //printf("0 - %s\n", resID[i]); sprintf(sc->regIDs[i], "%s", tempID[i]); //sprintf(resID[i], "%s", tempID[i]); //printf("1 - %s\n", resID[i]); } } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (rader_stage > 0) { switch (locStageRadix) { case 2: locStageSizeSum += locStageSize; break; case 3: locStageSizeSum += locStageSize * 2; break; case 4: locStageSizeSum += locStageSize * 2; break; case 5: locStageSizeSum += locStageSize * 4; break; case 6: locStageSizeSum += locStageSize * 5; break; case 7: locStageSizeSum += locStageSize * 6; break; case 8: locStageSizeSum += locStageSize * 3; break; case 9: locStageSizeSum += locStageSize * 8; break; case 10: locStageSizeSum += locStageSize * 9; break; case 11: locStageSizeSum += locStageSize * 10; break; case 12: locStageSizeSum += locStageSize * 11; break; case 13: locStageSizeSum 
+= locStageSize * 12; break; case 14: locStageSizeSum += locStageSize * 13; break; case 15: locStageSizeSum += locStageSize * 14; break; case 16: locStageSizeSum += locStageSize * 4; break; case 32: locStageSizeSum += locStageSize * 5; break; default: locStageSizeSum += locStageSize * (locStageRadix); break; } } locStageSize *= locStageRadix; locStageAngle /= locStageRadix; if (rader_stage != sc->currentRaderContainer->numStages - 1) { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } } //iFFT locStageSize = 1; locStageAngle = double_PI; locStageSizeSum = 0; for (int64_t rader_stage = sc->currentRaderContainer->numStages - 1; rader_stage >= 0; rader_stage--) { uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix uint64_t locFFTDimStride = locFFTDim; //different length due to all -1 cutoffs if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; //local radix if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t k = 0; k < sc->registerBoost; k++) { if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < 
logicalRegistersPerThread / locStageRadix; j++) { if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUTiFFT); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)(locStageAngle), LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) { for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t id = j + i * logicalRegistersPerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s + %" PRIu64 ") + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, 
sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strided) { if (sc->resolveBankConflictFirstStages == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } char** regID = (char**)malloc(sizeof(char*) * locStageRadix); if (regID) { for (uint64_t i = 0; i < locStageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", sc->regIDs[id]); } res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; id = (id / logicalRegistersPerThread) 
* sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < locStageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; if (!strided) { if (rader_stage == 0) { if (sc->sharedStrideRaderFFT > 0) { sc->tempLen = sprintf(sc->tempStr, "\ sharedStride = %" PRIu64 ";\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; //local shuffle char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); if (tempID) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } for (uint64_t k = 0; k < sc->registerBoost; ++k) { uint64_t t = 0; if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); 
res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (rader_stage == 0) { res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; } if (!strided) { if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) { shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0; if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; } else { shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 
0 : 1; if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; } } for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize); res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", locStageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", locStageRadix); res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ for (uint64_t i = 0; i < locStageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / 
locStageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); t++; if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { sprintf(tempNum, "%" PRIu64 "", i * locStageSize); res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum); if (res != VKFFT_SUCCESS) return res; if (rader_stage == 0) { locFFTDimStride = locFFTDim; //last stage - add x0 uint64_t g = sc->currentRaderContainer->generator; if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix, stageRadix - 1, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%" PRIu64 "-%s);\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->combinedID, sc->combinedID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inverse) { sprintf(tempNum, "(%" PRIu64 "-%s)*%" PRIu64 "", (stageRadix), sc->combinedID, stageSize); } else { sprintf(tempNum, "%s*%" PRIu64 "", sc->combinedID, stageSize); } res = VkAddReal(sc, sc->sdataID, sc->raderIDx2, tempNum); if (res != VKFFT_SUCCESS) return res; //normalization is in kernel /*sprintf(tempNum, "%.17e%s", 1.0 / locFFTDim, LFending); res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], tempNum); if (res != VKFFT_SUCCESS) return res;*/ res = VkAddComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->x0[0]); if (res 
!= VKFFT_SUCCESS) return res; } else { if (!raderTranspose) { sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; } else { sprintf(tempNum, "%" PRIu64 "", sc->currentRaderContainer->containerFFTNum); res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%s", sc->raderIDx2); res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; } } if (!strided) { if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) { if (sc->resolveBankConflictFirstStages == 0) { sc->resolveBankConflictFirstStages = 1; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->resolveBankConflictFirstStages == 1) { sc->resolveBankConflictFirstStages = 0; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (strided) { res = VkMulReal(sc, 
sc->sdataID, sc->sdataID, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); if (res != VKFFT_SUCCESS) return res; } } //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); if ((((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) && (rader_stage == 0)) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); } if (res != VKFFT_SUCCESS) return res; } res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->sdataID, sc->regIDs[id], sc->regIDs[id]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; } for 
(uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { //printf("0 - %s\n", resID[i]); sprintf(sc->regIDs[i], "%s", tempID[i]); //sprintf(resID[i], "%s", tempID[i]); //printf("1 - %s\n", resID[i]); } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (rader_stage < (int64_t)sc->currentRaderContainer->numStages - 1) { switch (locStageRadix) { case 2: locStageSizeSum += locStageSize; break; case 3: locStageSizeSum += locStageSize * 2; break; case 4: locStageSizeSum += locStageSize * 2; break; case 5: locStageSizeSum += locStageSize * 4; break; case 6: locStageSizeSum += locStageSize * 5; break; case 7: locStageSizeSum += locStageSize * 6; break; case 8: locStageSizeSum += locStageSize * 3; break; case 9: locStageSizeSum += locStageSize * 8; break; case 10: locStageSizeSum += locStageSize * 9; break; case 11: locStageSizeSum += locStageSize * 10; break; case 12: locStageSizeSum += locStageSize * 11; break; case 13: locStageSizeSum += locStageSize * 12; break; case 14: locStageSizeSum += locStageSize * 13; break; case 15: locStageSizeSum += locStageSize * 14; break; case 16: locStageSizeSum += locStageSize * 4; break; case 32: locStageSizeSum += locStageSize * 5; break; default: locStageSizeSum += locStageSize * (locStageRadix); break; } } locStageSize *= locStageRadix; locStageAngle /= locStageRadix; res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } { uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; uint64_t 
logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!raderTranspose) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } 
else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ if (%s == 0) {\n", sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->sdataID, sc->raderIDx2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->x0[1], sc->x0[1], stageNormalization); } if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->x0[1]); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } return res; } static inline VkFFTResult appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) { VkFFTResult res = VKFFT_SUCCESS; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; char vecType[30]; char LFending[4] = ""; char tempNum[50] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "native_cos"; char 
sinDef[20] = "native_sin"; //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char stageNormalization[50] = ""; uint64_t normalizationValue = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { if ((sc->performDCT) && (sc->actualInverse)) { if (sc->performDCT == 1) normalizationValue = (sc->sourceFFTSize - 1) * 2; else normalizationValue = sc->sourceFFTSize * 2; } else normalizationValue = sc->sourceFFTSize; } if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { normalizationValue *= sc->fft_dim_full; } if (normalizationValue != 1) { sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); } char convolutionInverse[10] = ""; if (sc->convolutionStep) { if (stageAngle < 0) sprintf(convolutionInverse, ", 0"); else sprintf(convolutionInverse, ", 1"); } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] / ((stageRadix + 1) / 2) : sc->localSize[0] / ((stageRadix + 1) / 2); uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim / stageRadix) / (double)(num_logical_subgroups)); uint64_t require_cutoff_check = ((sc->fftDim == (num_logical_subgroups * num_logical_groups * stageRadix))) ? 0 : 1; uint64_t require_cutoff_check2; char* gl_LocalInvocationID = (strided) ? 
sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x; if (strided) { require_cutoff_check2 = ((sc->localSize[1] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1; } else { require_cutoff_check2 = ((sc->localSize[0] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1; } sc->tempLen = sprintf(sc->tempStr, " %s= %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s= %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t j = 0; j < 1; j++) { if (stageSize > 1) { if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%s];\n\n", sc->w, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*(%s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*(%s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*(%s));\n", sc->w, 2.0 / stageRadix, LFending, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%s+%" PRIu64 "];\n\n", sc->w, sc->raderIDx, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
//sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = (%" PRIu64 " + %s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, (stageRadix + 1) / 2, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if 
(require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; } if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //save x0 for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (strided) { if (sc->localSize[0] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s + %" PRIu64 ") * sharedStride + %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 " + sharedStride * %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, 
sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->x0[t], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //generator index + shuffle sc->tempLen = sprintf(sc->tempStr, "\ if(%s>0){\n", sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; uint64_t g = sc->currentRaderContainer->generator; if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%s-1];\n", sc->sdataID, stageRadix, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%s-1+%" PRIu64 "];\n", sc->sdataID, sc->raderIDx, sc->currentRaderContainer->raderUintLUToffset); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%s-1);\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", 
sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[t * 2], sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%s+ %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, (stageRadix - 1) / 2 - 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%s+ %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, (stageRadix - 1) / 2 - 1 + sc->currentRaderContainer->raderUintLUToffset); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%s+ %" PRIu64 ");\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, sc->raderIDx, (stageRadix - 1) / 2 - 1, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", 
sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[2 * t + 1], sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //load deconv kernel if (!sc->inline_rader_kernel) { for (uint64_t t = 0; t < (uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))); t++) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s * %" PRIu64 " + %" PRIu64 ";\n", sc->combinedID, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->localSize[0], t * sc->localSize[0] * 
sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->combinedID, (stageRadix - 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, "\ %s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%" PRIu64 " - %s);\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(%.17e%s*%s);\n", sc->w, cosDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; if (sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(%.17e%s*%s);\n", sc->w, sinDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s(%.17e%s*%s);\n", sc->w, sinDef, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(%.17e%s*%s);\n", sc->w, (double)(2.0 * double_PI / stageRadix), LFending, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res 
= VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //x0 is ready //no subgroups /* { sc->tempLen = sprintf(sc->tempStr, "\ if(%s==0){\n", sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 0;\n\ %s.y = 0;\n", sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = 0;\n", sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * (%s);\n", sc->sdataID, sc->combinedID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ while(%s<%" PRIu64 "){\n\ %s.x += sdata[%s].x;\n\ %s.y += sdata[%s].y;\n\ %s++; %s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->sdataID, sc->regIDs[0], sc->sdataID, sc->combinedID, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ while(%s<%" PRIu64 "){\n\ %s.x += sdata[%s].x;\n\ %s.y += sdata[%s].y;\n\ %s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->combinedID, sc->regIDs[0], sc->combinedID, sc->combinedID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = 0;\n", sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * (%s);\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ //subgroups /* { uint64_t 
numGroupsQuant = ((((sc->localSize[0] * sc->localSize[1] * sc->localSize[2]) % sc->warpSize) == 0) || (sc->numSubgroups == 1)) ? sc->numSubgroups : sc->numSubgroups - 1; if (numGroupsQuant != sc->numSubgroups) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, numGroupsQuant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t t = 0; t < (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant); t++) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = 0;\n", sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; uint64_t quant = (sc->warpSize < (sc->localSize[0] * sc->localSize[1] * sc->localSize[2])) ? sc->warpSize : (sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); for (uint64_t t2 = 0; t2 < (uint64_t)ceil(stageRadix / (double)quant); t2++) { if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupInvocationID, stageRadix % quant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+%" PRIu64 ") * %" PRIu64 ";\n", sc->sdataID, sc->gl_SubgroupInvocationID, t2 * quant, sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } 
sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[1], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); if (res != VKFFT_SUCCESS) return res; if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkSubgroupAdd(sc, sc->regIDs[0], sc->regIDs[0], 1); if (res != VKFFT_SUCCESS) return res; if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ if(%s==0){\n", sc->gl_SubgroupInvocationID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = 0;\n", sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] 
% numGroupsQuant) != 0)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (numGroupsQuant != sc->numSubgroups) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } }*/ sc->tempLen = sprintf(sc->tempStr, "\ if(%s > 0){\n", sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 " * sharedStride;\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x - %s.x;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x += %s.x;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = %s.y + %s.y;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y -= %s.y;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[2 * t]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = %s;\n", sc->combinedID, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[0], sc->regIDs[0], sc->temp, sc->temp); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - 
sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t t = 0; t < num_logical_groups; t++) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 0;\n", sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.y = 0;\n", sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ if(%s == %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = 1; %s.y = 0;\n", sc->w, sc->w); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (stageRadix - 1) / 2; i++) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = ((%" PRIu64 "+%s) %% %" PRIu64 ");\n", sc->sdataID, stageRadix - 1 - i, sc->raderIDx, (stageRadix - 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inline_rader_kernel) { sc->tempLen = sprintf(sc->tempStr, "\ %s.x = r_rader_kernel_%" PRIu64 "[%s];\n\ %s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->sdataID, sc->w, stageRadix, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s = 
sdata[%s+%" PRIu64 "];\n", sc->w, sc->sdataID, sc->RaderKernelOffsetShared[stageID]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t t = 0; t < num_logical_groups; t++) { #if(VKFFT_BACKEND != 2) //AMD compiler fix if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif sc->tempLen = sprintf(sc->tempStr, "\ %s = %s+ %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups + (1 + i) * sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s += %" PRIu64 "*sharedStride;\n", sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s += %" PRIu64 " ;\n", sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->temp, sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #if(VKFFT_BACKEND == 2) //AMD 
compiler fix if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s>=%" PRIu64 "){%s.x =0;%s.y=0;%s.x=0;%s.y=0;}\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups, sc->temp, sc->temp, sc->regIDs[0], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif sprintf(tempNum, "%s", sc->x0[t]); res = VkFMA3Complex(sc, tempNum, sc->regIDs[2 * t + 1], sc->regIDs[0], sc->w, sc->temp, sc->temp2); if (res != VKFFT_SUCCESS) return res; #if(VKFFT_BACKEND != 2) //AMD compiler fix if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif #if(VKFFT_BACKEND == 2) //AMD compiler fix if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) > 2048) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % 
((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif } #if(VKFFT_BACKEND == 2) //AMD compiler fix if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) <= 2048) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } #endif } for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * 
num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%s", sc->x0[t]); sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x-%s.x;\n\ %s.y = %s.y+%s.y;\n", sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s.x = %s.x+%s.x;\n\ %s.y = %s.y-%s.y;\n", sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", 
sc->raderIDx, (stageRadix - 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[1], sc->regIDs[1]); //res = VkAppendLine(sc); //if (res != VKFFT_SUCCESS) return res; if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%" PRIu64 "-%s);\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, stageRadix - 1, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = 0;\n", sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups); res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" 
PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[2 * t], sc->regIDs[2 * t], stageNormalization); } if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inline_rader_g_pow == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, (stageRadix - 1) / 2, 
sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else if (sc->inline_rader_g_pow == 2) { sc->tempLen = sprintf(sc->tempStr, "\ %s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, (stageRadix - 1) / 2 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ %s= (%" PRIu64 "-%s);\n\ %s=1;\n\ while (%s != 0)\n\ {\n\ %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ %s--;\n\ }\n", sc->inoutID, (stageRadix - 1) / 2, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t t = 0; t < num_logical_groups; t++) { if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups); res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (strided) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], stageNormalization); } if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((require_cutoff_check) && (t == num_logical_groups - 1)) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (require_cutoff_check2) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } } return res; } static inline VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) 
sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char convolutionInverse[10] = ""; if (sc->convolutionStep) { if (stageAngle < 0) sprintf(convolutionInverse, ", 0"); else sprintf(convolutionInverse, ", 1"); } uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT))) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; //upload second stage of LUT to sm uint64_t numLUTelementsStage = 0; switch (stageRadix) { case 2: numLUTelementsStage = 1; break; case 4: numLUTelementsStage = 2; break; case 8: numLUTelementsStage = 3; break; case 16: numLUTelementsStage = 4; break; case 32: numLUTelementsStage = 5; break; default: if (stageRadix < sc->fixMinRaderPrimeMult) numLUTelementsStage = stageRadix - 1; else numLUTelementsStage = stageRadix; break; } if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && (sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize)) sc->useCoalescedLUTUploadToSM = 1; else sc->useCoalescedLUTUploadToSM = 0; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if 
(logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) { //if(sc->readToRegisters==0){ for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + i * logicalRegistersPerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / 
stageRadix); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->resolveBankConflictFirstStages == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!sc->useCoalescedLUTUploadToSM) { char** regID = (char**)malloc(sizeof(char*) * stageRadix); if (regID) { for (uint64_t i = 0; i < stageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", sc->regIDs[id]); /*if(j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) sprintf(regID[i], "%s", sc->regIDs[j + i * logicalStoragePerThread / stageRadix]); else sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix)/ logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ } res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * 
logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < stageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->useCoalescedLUTUploadToSM) { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->useCoalescedLUTUploadToSM = 1; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])); i++) { if (i > 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } char** regID = (char**)malloc(sizeof(char*) * stageRadix); if (regID) { for (uint64_t i = 0; i < stageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", 
sc->regIDs[id]); /*if(j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) sprintf(regID[i], "%s", sc->regIDs[j + i * logicalStoragePerThread / stageRadix]); else sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix)/ logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->useCoalescedLUTUploadToSM) { if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < stageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((stageSize == 1) && (sc->cacheShuffle)) { for (uint64_t 
i = 0; i < logicalRegistersPerThread; i++) { uint64_t id = i + k * logicalRegistersPerThread; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sc->tempLen = sprintf(sc->tempStr, "\ shuffle[%" PRIu64 "]=%s;\n", i, sc->regIDs[id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { uint64_t id = i + k * logicalRegistersPerThread; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sc->tempLen = sprintf(sc->tempStr, "\ %s=shuffle[(%" PRIu64 "+tshuffle)%%(%" PRIu64 ")];\n", sc->regIDs[id], i, logicalRegistersPerThread); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, 
"float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char convolutionInverse[10] = ""; if (sc->convolutionStep) { if (stageAngle < 0) sprintf(convolutionInverse, ", 0"); else sprintf(convolutionInverse, ", 1"); } uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT))) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; //upload second stage of LUT to sm uint64_t numLUTelementsStage = 0; switch (stageRadix) { case 2: numLUTelementsStage = 1; break; case 4: numLUTelementsStage = 2; break; case 8: 
numLUTelementsStage = 3; break; case 16: numLUTelementsStage = 4; break; case 32: numLUTelementsStage = 5; break; default: if (stageRadix < sc->fixMinRaderPrimeMult) numLUTelementsStage = stageRadix - 1; else numLUTelementsStage = stageRadix; break; } if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && (sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize)) sc->useCoalescedLUTUploadToSM = 1; else sc->useCoalescedLUTUploadToSM = 0; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) { for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + i * logicalRegistersPerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s*(%s+%" PRIu64 ")+%s];\n", sc->regIDs[id], sc->sharedStride, sc->gl_LocalInvocationID_y, j * logicalGroupSize + i * sc->fftDim / stageRadix, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!sc->useCoalescedLUTUploadToSM) { char** regID = (char**)malloc(sizeof(char*) * stageRadix); if (regID) { for (uint64_t i = 0; i < stageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", sc->regIDs[id]); /*if (j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) sprintf(regID[i], "_%" PRIu64 "", j + i * logicalStoragePerThread / stageRadix); else sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix) / logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % 
logicalRegistersPerThread);*/ } res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < stageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //upload second stage of LUT to sm if (sc->useCoalescedLUTUploadToSM) { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; sc->useCoalescedLUTUploadToSM = 1; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])); i++) { if (i > 0) { sc->tempLen = sprintf(sc->tempStr, "\ %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } char** regID = (char**)malloc(sizeof(char*) * stageRadix); if (regID) { for (uint64_t i = 0; i < stageRadix; i++) { regID[i] = (char*)malloc(sizeof(char) * 50); if (!regID[i]) { for (uint64_t p = 0; p < i; p++) { 
free(regID[p]); regID[p] = 0; } free(regID); regID = 0; return VKFFT_ERROR_MALLOC_FAILED; } uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(regID[i], "%s", sc->regIDs[id]); /*if (j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) sprintf(regID[i], "_%" PRIu64 "", j + i * logicalStoragePerThread / stageRadix); else sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix) / logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); else sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", (double)stageAngle, LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(sc->regIDs[id], "%s", regID[i]); } for (uint64_t i = 0; i < stageRadix; i++) { free(regID[i]); regID[i] = 0; } free(regID); regID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " 
}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; if (stageSize == 1) { sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } return res; } static inline VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t shuffleType) { VkFFTResult res = VKFFT_SUCCESS; if (sc->rader_generator[stageID]) { for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { if (sc->raderContainer[i].prime == stageRadix) { sc->currentRaderContainer = &sc->raderContainer[i]; } } if (sc->currentRaderContainer->type) { switch (shuffleType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { res = appendMultRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 0); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { res = appendMultRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 1); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } } } else { switch (shuffleType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { res = appendFFTRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 0); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } case 1: case 
2: case 111: case 121: case 131: case 141: case 143: case 145: { res = appendFFTRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 1); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } } } } else { switch (shuffleType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { res = appendRadixStageNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { res = appendRadixStageStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } } } return res; } static inline VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, long double stageAngle) { VkFFTResult res = VKFFT_SUCCESS; /*if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { uint64_t bluesteinInverseNormalize = 1; if ((sc->useBluesteinFFT) && (stageAngle > 0) && (stageSize == 1) && (sc->normalize) && (sc->axis_upload_id == 0)) bluesteinInverseNormalize = sc->bluesteinNormalizeSize; char stageNormalization[50] = ""; if ((stageSize == 1) && (sc->performDCT) && (sc->actualInverse)) { if (sc->performDCT == 4) sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 4 * bluesteinInverseNormalize); else sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 2 * bluesteinInverseNormalize); } else sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * bluesteinInverseNormalize); uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; for (uint64_t k = 0; k < sc->registerBoost; ++k) { for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { res = VkDivComplexNumber(sc, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], stageNormalization); if (res != VKFFT_SUCCESS) return res; } } }*/ return res; } static inline VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char stageNormalization[50] = ""; uint64_t normalizationValue = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && 
(stageAngle < 0)))) { if ((sc->performDCT) && (sc->actualInverse)) { if (sc->performDCT == 1) normalizationValue = (sc->sourceFFTSize - 1) * 2; else normalizationValue = sc->sourceFFTSize * 2; } else normalizationValue = sc->sourceFFTSize; } if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { normalizationValue *= sc->fft_dim_full; } if (normalizationValue != 1) { sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); } char tempNum[50] = ""; uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];// (sc->registers_per_thread % stageRadixNext == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext); if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)))) || (sc->performDCT))) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) { if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || 
((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT))) { if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); if (tempID) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < sc->registerBoost; ++k) { uint64_t t = 0; if (k > 0) { res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut 
= sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize); res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_x, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ if ((stageSize == 1) && (sc->cacheShuffle)) { for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); t++; if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { sprintf(tempNum, "%" PRIu64 "", i); res = VkAddReal(sc, sc->sdataID, tempNum, 
sc->tshuffle); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", logicalRegistersPerThread); res = VkModReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkMulReal(sc, sc->sdataID, sc->sdataID, tempNum); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); if (res != VKFFT_SUCCESS) return res; } res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->inoutID); if (res != VKFFT_SUCCESS) return res; //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "", i, logicalRegistersPerThread, stageSize); if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); if (res != VKFFT_SUCCESS) return res; } res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "] = temp%s%s;\n", i, logicalRegistersPerThread, stageSize, sc->regIDs[id], stageNormalization);*/ } } else { for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); t++; if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { sprintf(tempNum, "%" PRIu64 "", i * stageSize); res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum); if (res != VKFFT_SUCCESS) return res; if ((stageSize <= sc->numSharedBanks / 2) && 
(sc->fftDim > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != sc->fftDim / sc->registerBoost) && ((sc->fftDim & (sc->fftDim - 1)) == 0) && (stageSize * stageRadix != sc->fftDim)) { if (sc->resolveBankConflictFirstStages == 0) { sc->resolveBankConflictFirstStages = 1; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->resolveBankConflictFirstStages == 1) { sc->resolveBankConflictFirstStages = 0; sc->tempLen = sprintf(sc->tempStr, "\ %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->localSize[1] > 1) { res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); if (res != VKFFT_SUCCESS) return res; } //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); if (res != VKFFT_SUCCESS) return res; } res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } } if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * 
stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (logicalGroupSize != sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; if (sc->registerBoost > 1) { if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThreadNext, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) { for (uint64_t i = 0; i < stageRadixNext; i++) { uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalStoragePerThreadNext / stageRadixNext; id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext; //resID[t + k * sc->registers_per_thread] = sc->regIDs[id]; sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext); res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_x, tempNum); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] > 1) { res = VkMulReal(sc, 
sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); if (res != VKFFT_SUCCESS) return res; } if (sc->resolveBankConflictFirstStages == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext); res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = sdata[sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/ t++; } } if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } else { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { //printf("0 - %s\n", resID[i]); sprintf(sc->regIDs[i], "%s", tempID[i]); //sprintf(resID[i], "%s", tempID[i]); //printf("1 - %s\n", resID[i]); } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } else { char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * 
sc->registerBoost); if (tempID) { //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } for (uint64_t k = 0; k < sc->registerBoost; ++k) { for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]); } } for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]); } } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { sprintf(sc->regIDs[i], "%s", tempID[i]); } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } } else { res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { for (uint64_t i = 0; i < logicalStoragePerThread; i++) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % 
logicalRegistersPerThread], stageNormalization); } if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = temp%s%s;\n", sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);*/ } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } return res; } static inline VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); #endif char tempNum[50] = ""; uint64_t logicalStoragePerThread 
= sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];//(sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext); char stageNormalization[50] = ""; uint64_t normalizationValue = 1; if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { if ((sc->performDCT) && (sc->actualInverse)) { if (sc->performDCT == 1) normalizationValue = (sc->sourceFFTSize - 1) * 2; else normalizationValue = sc->sourceFFTSize * 2; } else normalizationValue = sc->sourceFFTSize; } if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { normalizationValue *= sc->fft_dim_full; } if (normalizationValue != 1) { sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); } if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && 
((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT))) { res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; } if (stageSize == sc->fftDim / stageRadix) { sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->sharedStrideReadWriteConflict); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT))) { //if (sc->writeFromRegisters == 0) { //appendBarrierVkFFT(sc, 2); if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); if (tempID) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t k = 0; k < 
sc->registerBoost; ++k) { uint64_t t = 0; if (k > 0) { res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize); res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_y, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageSize); res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; sprintf(tempNum, "%" PRIu64 "", stageRadix); res = 
VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ stageInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") %% (%" PRIu64 ");\n\ blockInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") - stageInvocationID;\n\ inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); t++; if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { sprintf(tempNum, "%" PRIu64 "", i * stageSize); res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); if (res != VKFFT_SUCCESS) return res; //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); if (res != VKFFT_SUCCESS) return res; } res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); if (res != VKFFT_SUCCESS) return res; } /*sc->tempLen = sprintf(sc->tempStr, "\ sdata[gl_WorkGroupSize.x*(inoutID+%" PRIu64 ")+gl_LocalInvocationID.x] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ } if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { if 
(logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (logicalGroupSize != sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); t++; } t = 0; if (sc->registerBoost > 1) { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThreadNext, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) { for (uint64_t i = 0; i < stageRadixNext; i++) { uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalRegistersPerThreadNext / stageRadixNext; id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext; sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext); res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_y, tempNum); if (res != VKFFT_SUCCESS) return res; res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID); if (res != VKFFT_SUCCESS) return res; res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); if (res != 
VKFFT_SUCCESS) return res; //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext); res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID); if (res != VKFFT_SUCCESS) return res; /*sc->tempLen = sprintf(sc->tempStr, "\ temp%s = sdata[gl_WorkGroupSize.x*(gl_LocalInvocationID.y+%" PRIu64 ")+gl_LocalInvocationID.x];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/ t++; } } if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } else { res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { sprintf(sc->regIDs[i], "%s", tempID[i]); } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } else { char** tempID; tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); if (tempID) { //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { tempID[i] = (char*)malloc(sizeof(char) * 50); if (!tempID[i]) { for (uint64_t j = 0; j < i; j++) { free(tempID[j]); tempID[j] = 0; } free(tempID); tempID = 0; return VKFFT_ERROR_MALLOC_FAILED; } } for (uint64_t k = 0; k < sc->registerBoost; ++k) { for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { for (uint64_t i = 0; i < stageRadix; i++) { uint64_t id = j + k * 
logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]); } } for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]); } } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { sprintf(sc->regIDs[i], "%s", tempID[i]); } for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { free(tempID[i]); tempID[i] = 0; } free(tempID); tempID = 0; } else return VKFFT_ERROR_MALLOC_FAILED; } } else { res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { if (strcmp(stageNormalization, "")) { res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization); } if (res != VKFFT_SUCCESS) return res; } } if (sc->localSize[1] * logicalRegistersPerThread > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != 
VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } return res; } /* appendRadixShuffle: forwards to appendRadixShuffleNonStrided (shuffleType 0, 5, 6, 110, 120, 130, 140, 142, 144) or to appendRadixShuffleStrided (shuffleType 1, 2, 111, 121, 131, 141, 143, 145), passing the stage parameters through unchanged. The call is made only when sc->rader_generator[stageID] == 0; otherwise, and for any unlisted shuffleType, nothing is appended and VKFFT_SUCCESS is returned. stageID is used only to index rader_generator. Returns the result of the forwarded call. */ static inline VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, long double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t stageID, uint64_t shuffleType) { VkFFTResult res = VKFFT_SUCCESS; if (sc->rader_generator[stageID] == 0) { switch (shuffleType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { res = appendRadixShuffleNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { res = appendRadixShuffleStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext); if (res != VKFFT_SUCCESS) return res; //appendBarrierVkFFT(sc, 1); break; } } } return res; } /* appendBoostThreadDataReorder: appends code that passes each of the sc->registerBoost register sets through sdata: the registers are written to sdata, appendBarrierVkFFT(sc, 2) is called, and the same registers are read back. Code is appended only when sc->registerBoost > 1 and the per-thread storage derived from the selected radix differs from sc->min_registers_per_thread * sc->registerBoost. start == 1 derives that storage from sc->stageRadix[0], any other value from the radix of the last stage. With start == 0 the write side uses the logicalGroupSize spacing inside an emitted "if (id * storage < fftDim)" guard and the read side uses the localSize spacing over min_registers_per_thread registers; with start == 1 the two roles are swapped. shuffleType selects the non-strided form (indexed by gl_LocalInvocationID_x) or the strided form (sharedStride and gl_LocalInvocationID_y); unlisted values append nothing. floatType and uintType are not referenced in the body. Returns VKFFT_SUCCESS or the first error reported by an append helper. */ static inline VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t shuffleType, uint64_t start) { VkFFTResult res = VKFFT_SUCCESS; switch (shuffleType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { uint64_t logicalStoragePerThread; if (start == 1) { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } else { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread; /* logicalGroupSize is the sdata index spacing between consecutive registers in the guarded layout below (i * logicalGroupSize) */ if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) { /* one write / barrier / read round trip per register set k */ for (uint64_t k = 0; k < sc->registerBoost; k++) { if (k > 0) { res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; /* write phase: emits "sdata[...] = register" lines */ if (start == 0) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; /* read phase: emits "register = sdata[...]" lines */ if (start == 1) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < 
%" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } break; } /* strided form: same sequence of steps, with sdata indexed through sc->sharedStride and gl_LocalInvocationID_y, and sc->localSize[1] as the spacing of the unguarded layout */ case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { uint64_t logicalStoragePerThread; if (start == 1) { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } else { logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; } uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread; if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) { for (uint64_t k = 0; k < sc->registerBoost; k++) { if (k > 0) { res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; /* write phase: emits "sdata[...] = register" lines */ if (start == 0) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 2); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); 
if (res != VKFFT_SUCCESS) return res; if (start == 1) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; } } break; } } return res; } /* appendCoordinateRegisterStore: appends code that loads the thread's registers from sdata (every emitted line has the form "register = sdata[...]"). Nothing is appended unless sc->writeFromRegisters is 0, or sc->convolutionStep is set together with sc->matrixConvolution > 1 or sc->numKernels > 1. readType 0, 5, 6, 110, 120, 130, 140, 142, 144 uses the index sharedStride * y + x + i * gl_WorkGroupSize_x with ceil(fftDim / localSize[0]) registers; readType 1, 111, 121, 131, 141, 143, 145 uses sharedStride * (y + i * gl_WorkGroupSize_y) + x with ceil(fftDim / localSize[1]) registers; any other readType appends nothing. A register i >= 1 whose range localSize * (i + 1) exceeds fftDim is wrapped in an emitted "if (id < fftDim - localSize * i)" guard. When sc->matrixConvolution == 1 the loads are appended directly; otherwise they are wrapped in an emitted "switch (coordinate)" in which case 0 targets regIDs[i] and case c targets the name regIDs[i] followed by "_c". Each path starts with appendBarrierVkFFT(sc, 1), appendZeropadStart and disableThreadsStart, and ends with disableThreadsEnd and appendZeropadEnd. Returns VKFFT_SUCCESS or the first error reported by an append helper. */ static inline VkFFTResult appendCoordinateRegisterStore(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { VkFFTResult res = VKFFT_SUCCESS; if ((!sc->writeFromRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) { switch (readType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c { uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if 
(sc->matrixConvolution == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); } else { sc->tempLen = sprintf(sc->tempStr, "\ switch (coordinate) {\n\ case 0:\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < sc->matrixConvolution; i++) { sc->tempLen = sprintf(sc->tempStr, "\ case %" PRIu64 ":\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s_%" PRIu64 " = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < used_registers_read; j++) { if (sc->localSize[0] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s_%" PRIu64 " = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[j], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; break; } /* grouped form: same steps, sized by sc->localSize[1], guarded on gl_LocalInvocationID_y and indexed through sc->sharedStride */ case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c { uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = 
appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (sc->matrixConvolution == 1) { sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); } else { sc->tempLen = sprintf(sc->tempStr, "\ switch (coordinate) {\n\ case 0:\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, 
sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < sc->matrixConvolution; i++) { sc->tempLen = sprintf(sc->tempStr, "\ case %" PRIu64 ":\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ %s_%" PRIu64 " = sdata[%s*(%s)+%s];\n", sc->regIDs[0], i, sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < used_registers_read; j++) { if (sc->localSize[1] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ %s_%" PRIu64 " = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[j], i, sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; break; } } } return res; } static inline VkFFTResult 
appendCoordinateRegisterPull(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { VkFFTResult res = VKFFT_SUCCESS; if ((!sc->readToRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) { switch (readType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c { uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (sc->matrixConvolution == 1) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); } else { sc->tempLen = sprintf(sc->tempStr, "\ switch (coordinate) {\n\ case 0:\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < sc->matrixConvolution; i++) { sc->tempLen = sprintf(sc->tempStr, "\ case %" PRIu64 ":\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < used_registers_read; j++) { if (sc->localSize[0] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x, sc->regIDs[j], i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; break; } case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c { uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; if (sc->matrixConvolution == 1) { sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); } else { sc->tempLen = sprintf(sc->tempStr, "\ switch (coordinate) {\n\ case 0:\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, "\ sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < used_registers_read; i++) { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 1; i < sc->matrixConvolution; i++) { sc->tempLen = sprintf(sc->tempStr, "\ case %" PRIu64 ":\n", i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s*(%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < used_registers_read; j++) { if (sc->localSize[1] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, "\ sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[j], i); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->localSize[1] * (j + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } //appendBarrierVkFFT(sc, 3); sc->tempLen = sprintf(sc->tempStr, " break;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; break; } } } return res; } static inline VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) { VkFFTResult res = VKFFT_SUCCESS; char vecType[30]; #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #elif(VKFFT_BACKEND==5) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); #endif char separateRegisterStore[100] = "_store"; for (uint64_t i = 0; i < sc->registers_per_thread; i++) { sc->tempLen = sprintf(sc->tempStr, " %s %s%s;\n", vecType, sc->regIDs[i], separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < sc->matrixConvolution; j++) { sc->tempLen = 
sprintf(sc->tempStr, " %s %s_%" PRIu64 "%s;\n", vecType, sc->regIDs[i], j, separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } for (uint64_t i = 0; i < sc->registers_per_thread; i++) { //sc->tempLen = sprintf(sc->tempStr, " temp%s[i]=temp[i];\n", separateRegisterStore); sc->tempLen = sprintf(sc->tempStr, " %s%s=%s;\n", sc->regIDs[i], separateRegisterStore, sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 1; j < sc->matrixConvolution; j++) { sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 "%s=%s_%" PRIu64 ";\n", sc->regIDs[i], j, separateRegisterStore, sc->regIDs[i], j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " for (%s batchID=0; batchID < %" PRIu64 "; batchID++){\n", uintType, sc->numKernels); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t dataType) { VkFFTResult res = VKFFT_SUCCESS; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char requestCoordinate[100] = ""; if (sc->convolutionStep) { if (sc->matrixConvolution > 1) { sprintf(requestCoordinate, "0"); } } char requestBatch[100] = ""; char separateRegisterStore[100] = ""; if (sc->convolutionStep) { if (sc->numKernels > 1) { sprintf(requestBatch, "batchID"); sprintf(separateRegisterStore, "_store"); } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 0; j < sc->matrixConvolution; j++) { sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s 
temp_imag%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t used_registers_read = 1; switch (dataType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); break; case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); break; } for (uint64_t i = 0; i < used_registers_read; i++) { switch (dataType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->fftDim == sc->fft_dim_full) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize 
/ sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->fftDim == sc->fft_dim_full) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } char kernelName[100] = ""; sprintf(kernelName, "BluesteinConvolutionKernel"); if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full)) sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x + %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x - %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full)) sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y - 
%s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y + %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; switch (dataType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) { VkFFTResult res = VKFFT_SUCCESS; char convTypeLeft[20] = ""; char convTypeRight[20] = ""; if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); 
#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #endif } if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #endif } char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char requestCoordinate[100] = ""; if (sc->convolutionStep) { if (sc->matrixConvolution > 1) { sprintf(requestCoordinate, "0"); } } char index_x[2000] = ""; char index_y[2000] = ""; char requestBatch[100] = ""; char separateRegisterStore[100] = ""; if (sc->convolutionStep) { if (sc->numKernels > 1) { sprintf(requestBatch, "batchID"); sprintf(separateRegisterStore, "_store"); } } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; res = VkAppendLineFromInput(sc, sc->disableThreadsStart); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 0; j < sc->matrixConvolution; j++) { sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t used_registers_read = 1; switch (dataType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); 
break; case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); break; } for (uint64_t i = 0; i < used_registers_read; i++) { if (i > 0) { for (uint64_t j = 0; j < sc->matrixConvolution; j++) { sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " = 0;\n", j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " = 0;\n", j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } switch (dataType) { case 0: { if (sc->fftDim == sc->fft_dim_full) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, i * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputStride[0] > 1) { sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]); uint64_t tempSaveInputOffset = sc->inputOffset; uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; sc->inputOffset = sc->kernelOffset; sc->inputNumberByteSize = sc->kernelNumberByteSize; res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->inputOffset = 
tempSaveInputOffset; sc->inputNumberByteSize = tempSaveInputNumberByteSize; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch); } else { sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->fftDim, sc->inputStride[1]); uint64_t tempSaveInputOffset = sc->inputOffset; uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; sc->inputOffset = sc->kernelOffset; sc->inputNumberByteSize = sc->kernelNumberByteSize; res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->inputOffset = tempSaveInputOffset; sc->inputNumberByteSize = tempSaveInputNumberByteSize; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch); } } else { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")", sc->gl_LocalInvocationID_x, i * 
sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); uint64_t tempSaveInputOffset = sc->inputOffset; uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; sc->inputOffset = sc->kernelOffset; sc->inputNumberByteSize = sc->kernelNumberByteSize; res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->inputOffset = tempSaveInputOffset; sc->inputNumberByteSize = tempSaveInputNumberByteSize; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); } break; } case 1: { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); sprintf(index_y, "(%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")", sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, 
sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim); uint64_t tempSaveInputOffset = sc->inputOffset; uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; sc->inputOffset = sc->kernelOffset; sc->inputNumberByteSize = sc->kernelNumberByteSize; res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->inputOffset = tempSaveInputOffset; sc->inputNumberByteSize = tempSaveInputNumberByteSize; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((%s%s) %% (%" PRIu64 "), (%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim, requestCoordinate, requestBatch); break; } } char kernelName[100] = ""; sprintf(kernelName, "kernel_obj"); if ((sc->kernelBlockNum == 1) || (sc->useBluesteinFFT)) { for (uint64_t j = 0; j < sc->matrixConvolution; j++) { for (uint64_t l = 0; l < sc->matrixConvolution; l++) { uint64_t k = 0; if (sc->symmetricKernel) { k = (l < j) ? 
(l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); } else { k = (j * sc->matrixConvolution + l); } if (sc->conjugateConvolution == 0) { if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); } else { if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t l = 0; l < sc->matrixConvolution; l++) { uint64_t k = 0; if (sc->symmetricKernel) { k = (l < j) ? 
(l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); } else { k = (j * sc->matrixConvolution + l); } if (sc->conjugateConvolution == 0) { if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); } else { if (sc->conjugateConvolution == 1) { if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x - %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y ;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); } else { if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], 
separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->crossPowerSpectrumNormalization) { #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0 * w.x;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0 * w.x;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } for (uint64_t l = 1; l < sc->matrixConvolution; l++) { if 
(sc->crossPowerSpectrumNormalization) { #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); #endif res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { for (uint64_t j = 0; j < sc->matrixConvolution; j++) { sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t l = 0; l < sc->matrixConvolution; l++) { uint64_t k = 0; if 
(sc->symmetricKernel) { k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); } else { k = (j * sc->matrixConvolution + l); } if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t l = 0; l < sc->matrixConvolution; l++) { uint64_t k = 0; if (sc->symmetricKernel) { k = (l < j) ? 
(l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); } else { k = (j * sc->matrixConvolution + l); } if (l == 0) sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore); else sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; for (uint64_t l = 1; l < sc->matrixConvolution; l++) { sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } switch (dataType) { case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { if (sc->localSize[0] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { if (sc->localSize[1] * (i + 1) > sc->fftDim) { sc->tempLen = sprintf(sc->tempStr, "\ }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } break; } } } res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEnd(sc); if (res != VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult setWriteFromRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t writeType) { VkFFTResult res = VKFFT_SUCCESS; switch (writeType) { case 0: //single_c2c { if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { sc->writeFromRegisters = 0; } else sc->writeFromRegisters = 1; break; } case 1: //grouped_c2c { if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { sc->writeFromRegisters = 0; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } else sc->writeFromRegisters = 1; break; } case 2: //single_c2c_strided { if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || 
(sc->rader_generator[sc->numStages - 1] > 0)) { sc->writeFromRegisters = 0; } else sc->writeFromRegisters = 1; break; } case 5://single_r2c { sc->writeFromRegisters = 0; break; } case 6: //single_c2r { if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { sc->writeFromRegisters = 0; } else sc->writeFromRegisters = 1; break; } case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: { sc->writeFromRegisters = 0; break; } } return res; } static inline VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t writeType) { VkFFTResult res = VKFFT_SUCCESS; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; char vecType[30]; char outputsStruct[20] = ""; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); #if(VKFFT_BACKEND==0) if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (sc->outputBufferBlockNum == 1) sprintf(outputsStruct, "outputs"); else sprintf(outputsStruct, ".outputs"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #elif(VKFFT_BACKEND==1) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif(VKFFT_BACKEND==2) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); sprintf(outputsStruct, "outputs"); if 
(!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); sprintf(outputsStruct, "outputs"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; #elif(VKFFT_BACKEND==5) sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #endif char convTypeLeft[20] = ""; char convTypeRight[20] = ""; if ((!strcmp(floatTypeMemory, "half")) && (strcmp(floatType, "half"))) { if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { sprintf(convTypeLeft, "float16_t("); sprintf(convTypeRight, ")"); } else { sprintf(convTypeLeft, "f16vec2("); sprintf(convTypeRight, ")"); } } if ((!strcmp(floatTypeMemory, "float")) && (strcmp(floatType, "float"))) { if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(float)"); //sprintf(convTypeRight, ""); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, 
"(float)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "float("); sprintf(convTypeRight, ")"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "vec2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "conv_float2("); sprintf(convTypeRight, ")"); #endif } } if ((!strcmp(floatTypeMemory, "double")) && (strcmp(floatType, "double"))) { if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "(double)"); //sprintf(convTypeRight, ""); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "double("); sprintf(convTypeRight, ")"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(convTypeLeft, "dvec2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeft, "conv_double2("); sprintf(convTypeRight, ")"); #endif } } char index_x[2000] = ""; char 
index_y[2000] = ""; char requestCoordinate[100] = ""; if (sc->convolutionStep) { if (sc->matrixConvolution > 1) { sprintf(requestCoordinate, "coordinate"); } } char requestBatch[100] = ""; if (sc->convolutionStep) { if (sc->numKernels > 1) { sprintf(requestBatch, "batchID");//if one buffer - multiple kernel convolution } } switch (writeType) { case 0: //single_c2c { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->axisSwapped) { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); } char shiftY2[100] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); if (sc->fftDim < sc->fft_dim_full) { if (sc->axisSwapped) { if (!sc->reorderFourStep) { sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->reorderFourStep) { res = VkAppendLineFromInput(sc, sc->disableThreadsStart); } else { sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" 
PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); } if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " { \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, 
shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = 
appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = 
appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], 
sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); else sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) 
return res; if (sc->writeFromRegisters) { //not used if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); 
res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; /* if (sc->outputBufferBlockNum == 1) if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight); else if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, 
outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight); */ if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % 
sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, 
sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen 
= sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { /*if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else { if (!sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write)); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res;*/ if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); res = VkAppendLine(sc); 
if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { //not used if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, 
" %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * 
used_registers_write) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 1: //grouped_c2c { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, 
sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = 
sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } else { for (uint64_t k = 0; k < sc->registerBoost; k++) { 
for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { if (!sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 
")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch); if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], 
convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 2: //single_c2c_strided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * 
sc->fftDim, sc->fft_dim_full); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", 
sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->tempLen = 
sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 5://single_r2c { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) 
return res; } } uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < num_out; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if 
((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { //not working yet if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->mergeSequencesR2C) { if 
(sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")];\n\ %s = sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->w, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->temp, sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? %s.x+%s.x : %s.y+%s.y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, sc->w, sc->temp, sc->w, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? %s.y-%s.y : -%s.x+%s.x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, sc->w, sc->temp, sc->w, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n\ %s = sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->w, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), sc->temp, sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? 
%s.x+%s.x : %s.y+%s.y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, sc->w, sc->temp, sc->w, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? %s.y-%s.y : -%s.x+%s.x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, sc->w, sc->temp, sc->w, sc->temp); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" 
PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; } else { } /*sc->tempLen = sprintf(sc->tempStr, "\ if (%s==%" PRIu64 ") \n\ {\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - 1); sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); sprintf(index_x, "%" PRIu64 "", sc->fftDim / 2); sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY); indexInputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(2 * (%s%s), %" PRIu64 ");\n", sc->gl_GlobalInvocationID_y, shiftY, 
sc->inputStride[2] / (sc->inputStride[1] + 2)); if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", outputsStruct, convTypeLeft,sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight); VkAppendLine(sc, " }\n");*/ } break; } case 6: //single_c2r { char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY * %" PRIu64 "", sc->localSize[1]); if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; if (sc->reorderFourStep) { //Not implemented } else { if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" 
PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, mult * sc->outputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " 
if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s.x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s.y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = 
sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, 
convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if 
((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { } } break; } case 110://DCT-I nonstrided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; sc->fftDim = (sc->fftDim + 2) / 2; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); 
//if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } } uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]); //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < num_out; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); 
res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", 
sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, 
sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } else { if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } 
res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } sc->fftDim = 2 * sc->fftDim - 2; if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; } else { } } break; } case 111://DCT-I strided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; sc->fftDim = (sc->fftDim + 2) / 2; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim) / (double)sc->localSize[1]); //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < num_out; i++) { sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " 
if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = 
sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - 
combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" 
PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } sc->fftDim = 2 * sc->fftDim - 2; if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; } else { } } break; } case 120://DCT-II nonstrided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } } uint64_t num_out = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < num_out; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { 
sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*%s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, (double)(-double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, (double)(-double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", (double)(-double_PI / 2 / sc->fftDim), LFending, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = 
%s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, 
outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 
") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, 2 * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } 
else { if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, 
sc->fftDim / 2 + 1, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, 
" }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; } else { } } break; } case 121://DCT-II strided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]); //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread); for (uint64_t i = 0; i < num_out; i++) { sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " 
if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*%s(%.17e%s * (combinedID / %" PRIu64 ") );\n", cosDef, (double)(-double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17e%s * (combinedID / %" PRIu64 ") );\n", sinDef, (double)(-double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult 
= sincos_20(%.17e%s * (combinedID / %" PRIu64 ") );\n", (double)(-double_PI / 2 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*mult.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* 
sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% 
%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if((combinedID/ %" PRIu64 ")> 0){\n", 
sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") * %" PRIu64 " + %s%s;\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->outputStride[1], sc->gl_GlobalInvocationID_x, shiftX); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if(( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * 
num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; } else { } } break; } case 130://DCT-III nonstrided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } uint64_t maxBluesteinCutOff = 1; if (sc->zeropadBluestein[1]) { if (sc->axisSwapped) maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; else maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; } for (uint64_t k = 0; k < sc->registerBoost; k++) { //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if 
((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (-2*((combinedID %% %" PRIu64 ") %% 2)+1) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, 
sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (-2*((combinedID %% %" PRIu64 ") %% 2)+1) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->inoutID, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (-2*((combinedID %% %" PRIu64 ") %% 2)+1) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if 
(sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (-2*((combinedID %% %" PRIu64 ") %% 2)+1) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { 
sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } } else { } } break; } case 131://DCT-III strided { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); //uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < used_registers_write; i++) { sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = 
%s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res 
!= VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], 
sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], 
sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; }*/ if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if 
(sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } } else { } } break; } case 140: //DCT-IV nonstrided as 8N DFT { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->axisSwapped) { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); } else { if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); } char shiftY2[100] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); if (sc->fftDim < sc->fft_dim_full) { if (sc->axisSwapped) { if (!sc->reorderFourStep) { sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); 
if (res != VKFFT_SUCCESS) return res; } } else { sc->tempLen = sprintf(sc->tempStr, " { \n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //if (sc->reorderFourStep) { if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->outputStride[0], sc->fftDim / 8, sc->outputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" 
PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, 
convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + (combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + (combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } /*else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID 
= %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); else sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 
")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = 
sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } }*/ /*} else { if (sc->fftDim == sc->fft_dim_full) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputStride[0] > 1) sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % 
sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) 
sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } } else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->localSize[1] == 1) 
sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return 
res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->axisSwapped) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, 
sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } }*/ sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 141: //DCT-IV strided as 8N DFT { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadStart(sc); if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); if (sc->fftDim != sc->fft_dim_full) sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); else sc->tempLen = 
sprintf(sc->tempStr, " {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { if (sc->fftDim == sc->fft_dim_full) sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]); else sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } /*} else { for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch); if (sc->writeFromRegisters) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; } else { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } }*/ sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; break; } case 142://DCT-IV nonstrided as 2xN/2 DCT-II { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } uint64_t maxBluesteinCutOff = 1; if (sc->zeropadBluestein[1]) { if (sc->axisSwapped) maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; else maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; } for (uint64_t k = 0; k < sc->registerBoost; k++) { //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") 
%% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID %% %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } for 
(uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if (sc->size[1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(index_x, "combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", sc->fftDim, sc->fftDim, sc->outputStride[1]); sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if 
(res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT4LUT, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", cosDef, (double)(-double_PI / 8 / sc->fftDim), LFending, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", sinDef, (double)(-double_PI / 8 / sc->fftDim), LFending, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", (double)(-double_PI / 8 / sc->fftDim), LFending, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" 
PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(index_x, "%" PRIu64 " - combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", 2 * sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->outputStride[1]); sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { 
sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if (sc->size[1] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (sc->size[1] % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } } else { } } break; } case 143://DCT-IV strided as 2xN/2 DCT-II { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftX2[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { 
sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID / %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 
" * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_write) * sc->localSize[1]); res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT4LUT, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) 
);\n", cosDef, (double)(-double_PI / 8 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) );\n", sinDef, (double)(-double_PI / 8 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) );\n", (double)(-double_PI / 8 / sc->fftDim), LFending, sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, 
sc->fft_dim_x); sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_write) * sc->localSize[1]); res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } if (sc->zeropadBluestein[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } } else { } } break; } case 144://odd DCT-IV nonstrided as N FFT { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX "); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "\ if (%s==0)\n\ {\n\ sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; } } //uint64_t num_out = (sc->axisSwapped) ? 
(uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); for (uint64_t i = 0; i < mult * used_registers_write; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
return res; } if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; if (sc->writeFromRegisters) { //not working yet if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID %% %" PRIu64 ";\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = 
sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" 
PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = 
%s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x += %s.y;\n\ else\n\ %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim / 2, sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", 
sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x -= %s.y;\n\ else\n\ %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = 
sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, 
"if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, 
" %s = sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x += %s.y;\n\ else\n\ %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->mergeSequencesR2C) { if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim 
- 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = 
sprintf(sc->tempStr, "}else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if (!sc->axisSwapped) sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim); else sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x -= %s.y;\n\ else\n\ %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen 
= sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->axisSwapped) { if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (sc->axisSwapped) { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } else { if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } } if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); } /*for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { if (sc->localSize[1] == 1) sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); else sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(%s + %" PRIu64 " < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], (sc->fftDim-1)/2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1)* sharedStride + (combinedID / %" PRIu64 ")];\n",sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% %" PRIu64 ")+2)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% %" PRIu64 ")+2) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = 
sprintf(sc->tempStr, " }else{\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->axisSwapped) { sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 "))* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } }*/ } else { } } break; } case 145://odd DCT-IV strided as N FFT { if (!sc->writeFromRegisters) { res = appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) return res; } //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; char shiftX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); char shiftY[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); char shiftY2[500] = ""; if (sc->performWorkGroupShift[1]) sprintf(shiftY2, " + consts.workGroupShiftY "); //uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; if (sc->reorderFourStep) { //Not implemented } else { //appendBarrierVkFFT(sc, 1); //appendZeropadStart(sc); if (sc->fftDim == sc->fft_dim_full) { if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } for (uint64_t k = 0; k < sc->registerBoost; k++) { for (uint64_t i = 0; i < used_registers_write; i++) { sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > sc->fftDim * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, 
sc->fft_zeropad_right_read[sc->axis_id]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = appendZeropadStartReadWriteStage(sc, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID / %" PRIu64 ";\n", sc->localSize[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * sharedStride + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x += %s.y;\n\ else\n\ %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim / 2, sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != 
VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x -= %s.y;\n\ else\n\ %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x += %s.y;\n\ else\n\ %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_x); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; 
sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ %s.x = -%s.x;\n\ else\n\ %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ %s.x -= %s.y;\n\ else\n\ %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " }\n\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->zeropad[1]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } res = appendZeropadEndReadWriteStage(sc); if (res != VKFFT_SUCCESS) return res; if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > sc->fftDim * sc->localSize[0]) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->size[0] % sc->localSize[0] != 0) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } } if (sc->zeropadBluestein[1]) { sc->fftDim = sc->fft_dim_full; used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); } } else { } } break; } } //res = appendZeropadEnd(sc); //if (res != 
VKFFT_SUCCESS) return res; return res; } static inline VkFFTResult shaderGenVkFFT_R2C_decomposition(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) { VkFFTResult res = VKFFT_SUCCESS; //appendLicense(output); if (!sc->disableSetLocale) { const char* loc_oldLocale = setlocale(LC_ALL, NULL); strcpy(sc->oldLocale, loc_oldLocale); setlocale(LC_ALL, "C"); } sc->output = output; sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength); if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED; sc->tempLen = 0; sc->currentLen = 0; char vecType[30]; char vecTypeInput[30]; char vecTypeOutput[30]; char inputsStruct[20] = ""; char outputsStruct[20] = ""; char LFending[4] = ""; if (!strcmp(floatType, "float")) sprintf(LFending, "f"); char uintType_32[30]; #if(VKFFT_BACKEND==0) sprintf(uintType_32, "unsigned int"); if (sc->inputBufferBlockNum == 1) sprintf(inputsStruct, "inputs"); else sprintf(inputsStruct, ".inputs"); if (sc->outputBufferBlockNum == 1) sprintf(outputsStruct, "outputs"); else sprintf(outputsStruct, ".outputs"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2"); sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x"); sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y"); 
sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "gl_GlobalInvocationID.x"); sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * gl_WorkGroupSize.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * gl_WorkGroupSize.y)"); sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.x"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * gl_WorkGroupSize.x)"); sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); sprintf(sc->gl_GlobalInvocationID_z, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * gl_WorkGroupSize.z)"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.z"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x"); sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y"); sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z"); sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #elif(VKFFT_BACKEND==1) sprintf(uintType_32, "unsigned int"); sprintf(inputsStruct, "inputs"); sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if 
(!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); sprintf(sc->gl_WorkGroupSize_y, 
"blockDim.y"); sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif(VKFFT_BACKEND==2) sprintf(uintType_32, "unsigned int"); sprintf(inputsStruct, "inputs"); sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, 
"blockIdx.y"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); sprintf(sc->gl_WorkGroupSize_y, "blockDim.y"); sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); if (!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "__cosf"; char sinDef[20] = "__sinf"; #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(uintType_32, "unsigned int"); sprintf(inputsStruct, "inputs"); sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)"); sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)"); sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)"); 
sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(1) * get_local_size(0))"); sprintf(sc->gl_GlobalInvocationID_y, "(get_local_id(1) + get_group_id(0) * get_local_size(1))"); sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(0)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(2) * get_local_size(0))"); sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); sprintf(sc->gl_GlobalInvocationID_z, "(get_local_id(2) + get_group_id(0) * get_local_size(2))"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(2)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(0)"); break; } sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)"); sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)"); sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)"); //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); char cosDef[20] = "native_cos"; char sinDef[20] = "native_sin"; #elif(VKFFT_BACKEND==5) sprintf(uintType_32, "uint"); sprintf(inputsStruct, "inputs"); sprintf(outputsStruct, "outputs"); if (!strcmp(floatType, "half")) sprintf(vecType, "half2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "half2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, 
"half2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "thread_position_in_threadgroup.x"); sprintf(sc->gl_LocalInvocationID_y, "thread_position_in_threadgroup.y"); sprintf(sc->gl_LocalInvocationID_z, "thread_position_in_threadgroup.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "thread_position_in_grid.x"); sprintf(sc->gl_GlobalInvocationID_y, "thread_position_in_grid.y"); sprintf(sc->gl_GlobalInvocationID_z, "thread_position_in_grid.z"); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.x"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.y * %" PRIu64 ")", sc->localSize[0]); sprintf(sc->gl_GlobalInvocationID_y, "(thread_position_in_threadgroup.y + threadgroup_position_in_grid.x * %" PRIu64 ")", sc->localSize[1]); sprintf(sc->gl_GlobalInvocationID_z, "thread_position_in_threadgroup.z"); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.x"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.z * %" PRIu64 ")", sc->localSize[0]); sprintf(sc->gl_GlobalInvocationID_y, "thread_position_in_threadgroup.y"); sprintf(sc->gl_GlobalInvocationID_z, "(thread_position_in_threadgroup.z + threadgroup_position_in_grid.x * %" PRIu64 ")", sc->localSize[2]); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.z"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.x"); break; } 
sprintf(sc->gl_WorkGroupSize_x, "%" PRIu64 "", sc->localSize[0]); sprintf(sc->gl_WorkGroupSize_y, "%" PRIu64 "", sc->localSize[1]); sprintf(sc->gl_WorkGroupSize_z, "%" PRIu64 "", sc->localSize[2]); //sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); //sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); char cosDef[20] = "cos"; char sinDef[20] = "sin"; #endif sprintf(sc->vecType, "%s", vecType); sprintf(sc->stageInvocationID, "stageInvocationID"); sprintf(sc->blockInvocationID, "blockInvocationID"); sprintf(sc->tshuffle, "tshuffle"); sprintf(sc->sharedStride, "sharedStride"); sprintf(sc->combinedID, "combinedID"); sprintf(sc->inoutID, "inoutID"); sprintf(sc->sdataID, "sdataID"); char convTypeLeftInput[20] = ""; char convTypeRightInput[20] = ""; if ((!strcmp(floatType, "float")) && (strcmp(floatTypeInputMemory, "float"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeftInput, "vec2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeftInput, "conv_float2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeftInput, "conv_float2("); sprintf(convTypeRightInput, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeftInput, "conv_float2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeftInput, "conv_float2("); sprintf(convTypeRightInput, ")"); #endif } if ((!strcmp(floatType, "double")) && (strcmp(floatTypeInputMemory, "double"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeftInput, "dvec2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeftInput, "conv_double2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeftInput, "conv_double2("); sprintf(convTypeRightInput, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeftInput, "conv_double2("); sprintf(convTypeRightInput, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeftInput, 
"conv_double2("); sprintf(convTypeRightInput, ")"); #endif } char convTypeLeftOutput[20] = ""; char convTypeRightOutput[20] = ""; if ((!strcmp(floatTypeOutputMemory, "half")) && (strcmp(floatType, "half"))) { sprintf(convTypeLeftOutput, "f16vec2("); sprintf(convTypeRightOutput, ")"); } if ((!strcmp(floatTypeOutputMemory, "float")) && (strcmp(floatType, "float"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeftOutput, "vec2("); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeftOutput, "conv_float2"); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeftOutput, "conv_float2"); sprintf(convTypeRightOutput, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeftOutput, "conv_float2("); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeftOutput, "conv_float2("); sprintf(convTypeRightOutput, ")"); #endif } if ((!strcmp(floatTypeOutputMemory, "double")) && (strcmp(floatType, "double"))) { #if(VKFFT_BACKEND==0) sprintf(convTypeLeftOutput, "dvec2("); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==1) sprintf(convTypeLeftOutput, "conv_double2("); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==2) sprintf(convTypeLeftOutput, "conv_double2("); sprintf(convTypeRightOutput, ")"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(convTypeLeftOutput, "conv_double2("); sprintf(convTypeRightOutput, ")"); #elif(VKFFT_BACKEND==5) sprintf(convTypeLeftOutput, "conv_double2("); sprintf(convTypeRightOutput, ")"); #endif } //sprintf(sc->tempReg, "temp"); res = appendVersion(sc); if (res != VKFFT_SUCCESS) return res; res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory); if (res != VKFFT_SUCCESS) return res; res = appendLayoutVkFFT(sc); if (res != VKFFT_SUCCESS) return res; res = appendConstantsVkFFT(sc, floatType, uintType); if (res != VKFFT_SUCCESS) return res; if (((!sc->LUT) || (!sc->LUT_4step)) && (!strcmp(floatType, 
"double"))) { res = appendSinCos20(sc, floatType, uintType); if (res != VKFFT_SUCCESS) return res; } if (strcmp(floatType, floatTypeInputMemory)) { res = appendConversion(sc, floatType, floatTypeInputMemory); if (res != VKFFT_SUCCESS) return res; } if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) { res = appendConversion(sc, floatType, floatTypeOutputMemory); if (res != VKFFT_SUCCESS) return res; } res = appendPushConstantsVkFFT(sc, floatType, uintType); if (res != VKFFT_SUCCESS) return res; uint64_t id = 0; res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, 0); if (res != VKFFT_SUCCESS) return res; id++; res = appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, 0); if (res != VKFFT_SUCCESS) return res; id++; if (sc->convolutionStep) { res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory); if (res != VKFFT_SUCCESS) return res; id++; } if (sc->LUT) { res = appendLUTLayoutVkFFT(sc, id, floatType); if (res != VKFFT_SUCCESS) return res; id++; } //appendIndexInputVkFFT(sc, uintType, type); //appendIndexOutputVkFFT(sc, uintType, type); /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 }; for (uint64_t i = 0; i < sc->numStages; i++) { if (appendedRadix[sc->stageRadix[i]] == 0) { appendedRadix[sc->stageRadix[i]] = 1; appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]); } }*/ #if(VKFFT_BACKEND==0) sc->tempLen = sprintf(sc->tempStr, "void main() {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ __launch_bounds__(%" PRIu64 ") void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", %s* 
kernel_obj", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); #elif(VKFFT_BACKEND==2) if (!sc->useUint64 && sc->useStrict32BitAddress > 0) { // These wrappers help hipcc to generate faster code for load and store operations where // 64-bit scalar + 32-bit vector registers are used instead of 64-bit vector saving a few // instructions for computing 64-bit vector addresses. sc->tempLen = sprintf(sc->tempStr, "template\n" "struct Inputs\n" "{\n" " const T* buffer;\n" " inline __device__ Inputs(const T* buffer) : buffer(buffer) {}\n" " inline __device__ const T& operator[](unsigned int idx) const { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" "};\n" "template\n" "struct Outputs\n" "{\n" " T* buffer;\n" " inline __device__ Outputs(T* buffer) : buffer(buffer) {}\n" " inline __device__ T& operator[](unsigned int idx) const { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" "};\n" ); } else { sc->tempLen = sprintf(sc->tempStr, "template\n" "using Inputs = const T*;\n" "template\n" "using Outputs = T*;\n" ); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInput, vecTypeOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", 
vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main_R2C ", sc->localSize[0], sc->localSize[1], sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->pushConstantsStructSize > 0) { sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "kernel void VkFFT_main_R2C "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType_32); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInput, vecTypeOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; int args_id = 2; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* kernel_obj[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; args_id++; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; args_id++; } if (sc->pushConstantsStructSize > 0) { sc->tempLen = sprintf(sc->tempStr, ", constant PushConsts& consts[[buffer(%d)]]", args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; args_id++; } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); #endif char index_x[2000] = ""; char idX[500] = ""; if (sc->performWorkGroupShift[0]) sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); else sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); //res = appendZeropadStart(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s id_x = %s %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0)); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s id_y = (%s / %" PRIu64 ") %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s id_z = (%s / %" PRIu64 ") 
/ %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "if (%s < %" PRIu64 "){\n", idX, (uint64_t)ceil(sc->size[0] / 4.0) * sc->size[1] * sc->size[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s inoutID = ", uintType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "id_x + id_y*%" PRIu64 " +id_z*%" PRIu64 "", sc->inputStride[1], sc->inputStride[2]); res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s inoutID2;\n", uintType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s inoutID3;\n", uintType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s t0 = %s%s[inoutID]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput); else sc->tempLen = sprintf(sc->tempStr, " %s t0 = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s tf;\n", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % 4 == 0) { sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID2 = "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]); res = 
indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID3 = "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (uint64_t)ceil(sc->size[0] / 4.0), sc->inputStride[1], sc->inputStride[2]); res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " tf = %s%s[inoutID3]%s;\n", convTypeLeftInput, inputsStruct, convTypeRightInput); else sc->tempLen = sprintf(sc->tempStr, " tf = %sinputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "]%s;\n", convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "} else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " inoutID2 = "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]); res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, "inoutID2 = "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], 
sc->inputStride[2]); res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, ";\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->inputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s t1 = %s%s[inoutID2]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput); else sc->tempLen = sprintf(sc->tempStr, " %s t1 = %sinputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s t2;\n", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " %s t3;\n", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->size[0] % 4 == 0) { if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } sc->tempLen = sprintf(sc->tempStr, " tf.y = -tf.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->inverse) { res = VkMulComplexNumber(sc, "tf", 
"tf", "2"); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID3] = %stf%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "] = %stf%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n"); res = 
VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } sc->tempLen = sprintf(sc->tempStr, "} else {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; res = VkAddComplex(sc, "t2", "t0", "t1"); if (res != VKFFT_SUCCESS) return res; res = VkSubComplex(sc, "t3", "t0", "t1"); if (res != VKFFT_SUCCESS) return res; if (!sc->inverse) { res = VkMulComplexNumber(sc, "t2", "t2", "0.5"); if (res != VKFFT_SUCCESS) return res; res = VkMulComplexNumber(sc, "t3", "t3", "0.5"); if (res != VKFFT_SUCCESS) return res; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, " tf = twiddleLUT[id_x];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " %s angle = (%.17e*id_x)/%" PRIu64 ";\n", floatType, 3.1415926535897932384626433832795, sc->size[0] / 2); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (!strcmp(floatType, "float")) { sc->tempLen = sprintf(sc->tempStr, " 
tf.x = %s(angle);\n", cosDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " tf.y = %s(angle);\n", sinDef); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } if (!strcmp(floatType, "double")) { sc->tempLen = sprintf(sc->tempStr, " tf = sincos_20(angle);\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } } if (!sc->inverse) { sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y-tf.y*t3.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y-tf.x*t3.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-t0.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x+t0.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } else { sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y+tf.y*t3.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y+tf.x*t3.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x+t0.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x-t0.x;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; } //sc->tempLen = sprintf(sc->tempStr, " t0.x = 
t2.x+tf.x*t2.y-tf.y*t3.x;\n"); //sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y-tf.y*t2.y-tf.x*t3.x;\n"); //sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-tf.x*t2.y+tf.y*t3.x;\n"); //sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y-tf.y*t2.y-tf.x*t3.x;\n"); if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st0%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st0%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; if (sc->outputBufferBlockNum == 1) sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st1%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); else sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st1%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //res = appendZeropadEnd(sc); //if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //printf("%s", output); return res; } static inline void freeShaderGenVkFFT(VkFFTSpecializationConstantsLayout* sc) { if (sc->tempStr) { free(sc->tempStr); sc->tempStr = 0; } if (sc->disableThreadsStart) { free(sc->disableThreadsStart); sc->disableThreadsStart = 0; } if (sc->disableThreadsEnd) { free(sc->disableThreadsEnd); sc->disableThreadsEnd = 0; } if (sc->regIDs) { for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { if 
(sc->regIDs[i]) { free(sc->regIDs[i]); sc->regIDs[i] = 0; } } free(sc->regIDs); sc->regIDs = 0; } if (!sc->disableSetLocale) { if (!strcmp(sc->oldLocale, "")) { setlocale(LC_ALL, sc->oldLocale); memset(sc->oldLocale, 0, 100 * sizeof(char)); } } if (sc->numRaderPrimes) { sc->currentRaderContainer = 0; } } static inline VkFFTResult shaderGenVkFFT(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) { VkFFTResult res = VKFFT_SUCCESS; //appendLicense(output); if (!sc->disableSetLocale) { const char* loc_oldLocale = setlocale(LC_ALL, NULL); strcpy(sc->oldLocale, loc_oldLocale); setlocale(LC_ALL, "C"); } sc->output = output; sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength); if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED; sc->tempLen = 0; sc->currentLen = 0; char vecType[30]; char vecTypeInput[30]; char vecTypeOutput[30]; char uintType_32[30]; #if(VKFFT_BACKEND==0) sprintf(uintType_32, "uint"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2"); sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x"); sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y"); sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, 
"gl_GlobalInvocationID.x"); sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * gl_WorkGroupSize.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * gl_WorkGroupSize.y)"); sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.x"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * gl_WorkGroupSize.x)"); sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); sprintf(sc->gl_GlobalInvocationID_z, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * gl_WorkGroupSize.z)"); sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.z"); sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x"); sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y"); sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z"); sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); #elif(VKFFT_BACKEND==1) sprintf(uintType_32, "unsigned int"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, 
"half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); sprintf(sc->gl_WorkGroupSize_y, "blockDim.y"); sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); #elif(VKFFT_BACKEND==2) sprintf(uintType_32, "unsigned int"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); 
if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); break; } 
sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); sprintf(sc->gl_WorkGroupSize_y, "blockDim.y"); sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sprintf(uintType_32, "unsigned int"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)"); sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)"); sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)"); sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); break; case 1: sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(1) * get_local_size(0))"); sprintf(sc->gl_GlobalInvocationID_y, "(get_local_id(1) + get_group_id(0) * get_local_size(1))"); sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(0)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); 
break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(2) * get_local_size(0))"); sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); sprintf(sc->gl_GlobalInvocationID_z, "(get_local_id(2) + get_group_id(0) * get_local_size(2))"); sprintf(sc->gl_WorkGroupID_x, "get_group_id(2)"); sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); sprintf(sc->gl_WorkGroupID_z, "get_group_id(0)"); break; } sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)"); sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)"); sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)"); #elif(VKFFT_BACKEND==5) sprintf(uintType_32, "uint"); if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "half2"); if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "half2"); if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); sprintf(sc->gl_LocalInvocationID_x, "thread_position_in_threadgroup.x"); sprintf(sc->gl_LocalInvocationID_y, "thread_position_in_threadgroup.y"); sprintf(sc->gl_LocalInvocationID_z, "thread_position_in_threadgroup.z"); switch (sc->swapComputeWorkGroupID) { case 0: sprintf(sc->gl_GlobalInvocationID_x, "thread_position_in_grid.x"); sprintf(sc->gl_GlobalInvocationID_y, "thread_position_in_grid.y"); sprintf(sc->gl_GlobalInvocationID_z, "thread_position_in_grid.z"); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.x"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.z"); break; case 1: 
sprintf(sc->gl_GlobalInvocationID_x, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.y * %" PRIu64 ")", sc->localSize[0]); sprintf(sc->gl_GlobalInvocationID_y, "(thread_position_in_threadgroup.y + threadgroup_position_in_grid.x * %" PRIu64 ")", sc->localSize[1]); sprintf(sc->gl_GlobalInvocationID_z, "thread_position_in_threadgroup.z"); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.x"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.z"); break; case 2: sprintf(sc->gl_GlobalInvocationID_x, "(thread_position_in_threadgroup.x + threadgroup_position_in_grid.z * %" PRIu64 ")", sc->localSize[0]); sprintf(sc->gl_GlobalInvocationID_y, "thread_position_in_threadgroup.y"); sprintf(sc->gl_GlobalInvocationID_z, "(thread_position_in_threadgroup.z + threadgroup_position_in_grid.x * %" PRIu64 ")", sc->localSize[2]); sprintf(sc->gl_WorkGroupID_x, "threadgroup_position_in_grid.z"); sprintf(sc->gl_WorkGroupID_y, "threadgroup_position_in_grid.y"); sprintf(sc->gl_WorkGroupID_z, "threadgroup_position_in_grid.x"); break; } sprintf(sc->gl_WorkGroupSize_x, "%" PRIu64 "", sc->localSize[0]); sprintf(sc->gl_WorkGroupSize_y, "%" PRIu64 "", sc->localSize[1]); sprintf(sc->gl_WorkGroupSize_z, "%" PRIu64 "", sc->localSize[2]); //sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); //sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); #endif sprintf(sc->vecType, "%s", vecType); sprintf(sc->stageInvocationID, "stageInvocationID"); sprintf(sc->blockInvocationID, "blockInvocationID"); sprintf(sc->tshuffle, "tshuffle"); sprintf(sc->sharedStride, "sharedStride"); sprintf(sc->combinedID, "combinedID"); sprintf(sc->inoutID, "inoutID"); sprintf(sc->sdataID, "sdataID"); sprintf(sc->raderIDx, "raderIDx"); sprintf(sc->raderIDx2, "raderIDx2"); //sprintf(sc->tempReg, "temp"); sc->disableThreadsStart = (char*)malloc(sizeof(char) * 500); if (!sc->disableThreadsStart) { freeShaderGenVkFFT(sc); return 
VKFFT_ERROR_MALLOC_FAILED; } sc->disableThreadsEnd = (char*)malloc(sizeof(char) * 2); if (!sc->disableThreadsEnd) { freeShaderGenVkFFT(sc); return VKFFT_ERROR_MALLOC_FAILED; } sc->disableThreadsStart[0] = 0; sc->disableThreadsEnd[0] = 0; res = appendVersion(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = appendLayoutVkFFT(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = appendConstantsVkFFT(sc, floatType, uintType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (((!sc->LUT) || (!sc->LUT_4step)) && (!strcmp(floatType, "double"))) { res = appendSinCos20(sc, floatType, uintType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (strcmp(floatType, floatTypeInputMemory)) { res = appendConversion(sc, floatType, floatTypeInputMemory); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) { res = appendConversion(sc, floatType, floatTypeOutputMemory); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } res = appendPushConstantsVkFFT(sc, floatType, uintType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } uint64_t id = 0; res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } id++; res = appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } id++; if (sc->convolutionStep) { res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } id++; } if (sc->LUT) { res = appendLUTLayoutVkFFT(sc, id, floatType); if (res != VKFFT_SUCCESS) { 
freeShaderGenVkFFT(sc); return res; } id++; } if (sc->raderUintLUT) { res = appendRaderUintLUTLayoutVkFFT(sc, id); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } id++; } if (sc->useBluesteinFFT) { res = appendBluesteinLayoutVkFFT(sc, id, floatType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->BluesteinConvolutionStep) id++; if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) id++; } //appendIndexInputVkFFT(sc, uintType, type); //appendIndexOutputVkFFT(sc, uintType, type); /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 }; for (uint64_t i = 0; i < sc->numStages; i++) { if (appendedRadix[sc->stageRadix[i]] == 0) { appendedRadix[sc->stageRadix[i]] = 1; appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]); } }*/ uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 1 : type; #if(VKFFT_BACKEND==0) res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } sc->tempLen = sprintf(sc->tempStr, "void main() {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } #elif(VKFFT_BACKEND==1) sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ void __launch_bounds__(%" PRIu64 ") VkFFT_main ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } switch (type) { case 5: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput); break; } case 6: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory); 
break; } case 110: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 111: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 120: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 121: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 130: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 131: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 140: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 141: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 142: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 143: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 144: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 145: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } default: { sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput); break; } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->LUT) { sc->tempLen = 
sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->raderUintLUT) { sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->BluesteinConvolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } #elif(VKFFT_BACKEND==2) sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (!sc->useUint64 && sc->useStrict32BitAddress > 0) { // These wrappers help hipcc to generate faster code for load and store operations where // 64-bit scalar + 32-bit vector registers are used instead of 64-bit vector saving a few // instructions for computing 64-bit vector addresses. 
sc->tempLen = sprintf(sc->tempStr, "template\n" "struct Inputs\n" "{\n" " const T* buffer;\n" " inline __device__ Inputs(const T* buffer) : buffer(buffer) {}\n" " inline __device__ const T& operator[](unsigned int idx) const { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" "};\n" "template\n" "struct Outputs\n" "{\n" " T* buffer;\n" " inline __device__ Outputs(T* buffer) : buffer(buffer) {}\n" " inline __device__ T& operator[](unsigned int idx) const { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" "};\n" ); } else { sc->tempLen = sprintf(sc->tempStr, "template\n" "using Inputs = const T*;\n" "template\n" "using Outputs = T*;\n" ); } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } switch (type) { case 5: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, vecTypeOutput); break; } case 6: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInput, floatTypeOutputMemory); break; } case 110: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 111: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 120: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 121: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 130: { sc->tempLen = 
sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 131: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 140: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 141: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 142: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 143: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 144: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 145: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } default: { sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInput, vecTypeOutput); break; } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> kernel_obj", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->raderUintLUT) { sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if 
(sc->BluesteinConvolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinConvolutionKernel", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinMultiplication", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } #elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main ", sc->localSize[0], sc->localSize[1], sc->localSize[2]); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } switch (type) { case 5: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput); break; } case 6: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory); break; } case 110: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 111: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 120: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 121: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, 
floatTypeOutputMemory); break; } case 130: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 131: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 140: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 141: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 142: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 143: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 144: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } case 145: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); break; } default: { sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput); break; } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } int args_id = 2; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->raderUintLUT) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* g_pow", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) 
{ freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->BluesteinConvolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinConvolutionKernel", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinMultiplication", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->pushConstantsStructSize > 0) { sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } #elif(VKFFT_BACKEND==5) sc->tempLen = sprintf(sc->tempStr, "kernel void VkFFT_main "); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "(%s3 thread_position_in_grid [[thread_position_in_grid]], ", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], ", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "%s3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], ", uintType_32); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; sc->tempLen = sprintf(sc->tempStr, "threadgroup %s* sdata [[threadgroup(0)]], ", vecType); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } switch (type) { case 5: { sc->tempLen = 
sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, vecTypeOutput); break; } case 6: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInput, floatTypeOutputMemory); break; } case 110: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 111: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 120: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 121: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 130: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 131: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 140: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 141: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 142: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 143: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 144: { 
sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } case 145: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", floatTypeInputMemory, floatTypeOutputMemory); break; } default: { sc->tempLen = sprintf(sc->tempStr, "device %s* inputs[[buffer(0)]], device %s* outputs[[buffer(1)]]", vecTypeInput, vecTypeOutput); break; } } res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } int args_id = 2; if (sc->convolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* kernel_obj[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->LUT) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* twiddleLUT[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->raderUintLUT) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* g_pow[[buffer(%d)]]", uintType_32, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->BluesteinConvolutionStep) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinConvolutionKernel[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { sc->tempLen = sprintf(sc->tempStr, ", constant %s* BluesteinMultiplication[[buffer(%d)]]", vecType, args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } args_id++; } if (sc->pushConstantsStructSize > 0) { sc->tempLen = sprintf(sc->tempStr, ", constant PushConsts& consts[[buffer(%d)]]", args_id); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; args_id++; } 
sc->tempLen = sprintf(sc->tempStr, ") {\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) return res; //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } #endif //if (type==0) sc->tempLen = sprintf(sc->tempStr, "return;\n"); res = appendInitialization(sc, floatType, uintType, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = setReadToRegisters(sc, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = setWriteFromRegisters(sc, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) { sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=%" PRIu64 "; coordinate > 0; coordinate--){\n\ coordinate--;\n", uintType, sc->matrixConvolution); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } res = appendReadDataVkFFT(sc, floatType, floatTypeInputMemory, uintType, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->useBluesteinFFT && sc->BluesteinPreMultiplication) { res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 0); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } //appendBarrierVkFFT(sc, 1); res = appendReorder4StepRead(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (!sc->useRader) { res = appendBoostThreadDataReorder(sc, floatType, uintType, locType, 1); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } uint64_t stageSize = 1; uint64_t stageSizeSum = 0; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; long double stageAngle = (sc->inverse) ? 
double_PI : -double_PI; for (uint64_t i = 0; i < sc->numStages; i++) { if ((i == sc->numStages - 1) && (sc->registerBoost > 1)) { res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } res = appendRegisterBoostShuffle(sc, floatType, stageSize, sc->stageRadix[i - 1], sc->stageRadix[i], stageAngle); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } else { res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (i > 0) { switch (sc->stageRadix[i]) { case 2: stageSizeSum += stageSize; break; case 3: stageSizeSum += stageSize * 2; break; case 4: stageSizeSum += stageSize * 2; break; case 5: stageSizeSum += stageSize * 4; break; case 6: stageSizeSum += stageSize * 5; break; case 7: stageSizeSum += stageSize * 6; break; case 8: stageSizeSum += stageSize * 3; break; case 9: stageSizeSum += stageSize * 8; break; case 10: stageSizeSum += stageSize * 9; break; case 11: stageSizeSum += stageSize * 10; break; case 12: stageSizeSum += stageSize * 11; break; case 13: stageSizeSum += stageSize * 12; break; case 14: stageSizeSum += stageSize * 13; break; case 15: stageSizeSum += stageSize * 14; break; case 16: stageSizeSum += stageSize * 4; break; case 32: stageSizeSum += stageSize * 5; break; default: stageSizeSum += stageSize * (sc->stageRadix[i]); break; } } if ((i == sc->numStages - 1) || (sc->registerBoost == 1)) { res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } else { res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], i, locType); if (res != VKFFT_SUCCESS) { 
freeShaderGenVkFFT(sc); return res; } } stageSize *= sc->stageRadix[i]; stageAngle /= sc->stageRadix[i]; } } if ((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) { res = appendCoordinateRegisterStore(sc, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->matrixConvolution > 1) { sc->tempLen = sprintf(sc->tempStr, " coordinate++;}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->numKernels > 1) { res = appendPreparationBatchedKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if (sc->useBluesteinFFT && sc->BluesteinConvolutionStep) { res = appendBluesteinConvolution(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } else { res = appendKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } appendBarrierVkFFT(sc, 1); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->matrixConvolution > 1) { sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=0; coordinate < %" PRIu64 "; coordinate++){\n", uintType, sc->matrixConvolution); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } res = appendCoordinateRegisterPull(sc, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } stageSize = 1; stageSizeSum = 0; stageAngle = double_PI; sc->inverse = 1; for (uint64_t i = 0; i < sc->numStages; i++) { res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (i > 0) { switch (sc->stageRadix[i]) { case 2: stageSizeSum += stageSize; break; case 3: stageSizeSum += stageSize * 2; break; case 4: stageSizeSum += stageSize * 2; 
break; case 5: stageSizeSum += stageSize * 4; break; case 6: stageSizeSum += stageSize * 5; break; case 7: stageSizeSum += stageSize * 6; break; case 8: stageSizeSum += stageSize * 3; break; case 9: stageSizeSum += stageSize * 8; break; case 10: stageSizeSum += stageSize * 9; break; case 11: stageSizeSum += stageSize * 10; break; case 12: stageSizeSum += stageSize * 11; break; case 13: stageSizeSum += stageSize * 12; break; case 14: stageSizeSum += stageSize * 13; break; case 15: stageSizeSum += stageSize * 14; break; case 16: stageSizeSum += stageSize * 4; break; case 32: stageSizeSum += stageSize * 5; break; default: stageSizeSum += stageSize * (sc->stageRadix[i]); break; } } if (i == sc->numStages - 1) { res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } else { res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], i, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } stageSize *= sc->stageRadix[i]; stageAngle /= sc->stageRadix[i]; } } if (!sc->useRader) { //if (((sc->stageRadix[sc->numStages - 1] < sc->fixMinRaderPrimeMult) || (sc->rader_generator[sc->numStages - 1] == 0))) { res = appendBoostThreadDataReorder(sc, floatType, uintType, locType, 0); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } res = appendReorder4StepWrite(sc, floatType, uintType, locType); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if (sc->useBluesteinFFT && sc->BluesteinPostMultiplication) { res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 1); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } res = appendWriteDataVkFFT(sc, floatType, floatTypeOutputMemory, uintType, type); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } if 
((sc->convolutionStep) && (sc->matrixConvolution > 1)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } if ((sc->convolutionStep) && (sc->numKernels > 1)) { sc->tempLen = sprintf(sc->tempStr, " }\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } } sc->tempLen = sprintf(sc->tempStr, "}\n"); res = VkAppendLine(sc); if (res != VKFFT_SUCCESS) { freeShaderGenVkFFT(sc); return res; } freeShaderGenVkFFT(sc); //if (sc->useBluesteinFFT) //printf("%s", output); return res; } #if(VKFFT_BACKEND==0) static inline VkFFTResult findMemoryType(VkFFTApplication* app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex) { VkPhysicalDeviceMemoryProperties memoryProperties = { 0 }; vkGetPhysicalDeviceMemoryProperties(app->configuration.physicalDevice[0], &memoryProperties); for (uint64_t i = 0; i < memoryProperties.memoryTypeCount; ++i) { if ((memoryTypeBits & ((uint64_t)1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties) && (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size >= memorySize)) { memoryTypeIndex[0] = (uint32_t)i; return VKFFT_SUCCESS; } } return VKFFT_ERROR_FAILED_TO_FIND_MEMORY; } static inline VkFFTResult allocateFFTBuffer(VkFFTApplication* app, VkBuffer* buffer, VkDeviceMemory* deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, VkDeviceSize size) { VkFFTResult resFFT = VKFFT_SUCCESS; VkResult res = VK_SUCCESS; uint32_t queueFamilyIndices; VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; bufferCreateInfo.queueFamilyIndexCount = 1; bufferCreateInfo.pQueueFamilyIndices = &queueFamilyIndices; bufferCreateInfo.size = size; bufferCreateInfo.usage = usageFlags; res = vkCreateBuffer(app->configuration.device[0], 
&bufferCreateInfo, 0, buffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_BUFFER; VkMemoryRequirements memoryRequirements = { 0 }; vkGetBufferMemoryRequirements(app->configuration.device[0], buffer[0], &memoryRequirements); VkMemoryAllocateInfo memoryAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; memoryAllocateInfo.allocationSize = memoryRequirements.size; resFFT = findMemoryType(app, memoryRequirements.memoryTypeBits, memoryRequirements.size, propertyFlags, &memoryAllocateInfo.memoryTypeIndex); if (resFFT != VKFFT_SUCCESS) return resFFT; res = vkAllocateMemory(app->configuration.device[0], &memoryAllocateInfo, 0, deviceMemory); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY; res = vkBindBufferMemory(app->configuration.device[0], buffer[0], deviceMemory[0], 0); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY; return resFFT; } #endif static inline VkFFTResult VkFFT_transferDataFromCPU(VkFFTApplication* app, void* cpu_arr, void* input_buffer, uint64_t transferSize) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkBuffer* buffer = (VkBuffer*)input_buffer; VkDeviceSize bufferSize = transferSize; VkResult res = VK_SUCCESS; VkDeviceSize stagingBufferSize = bufferSize; VkBuffer stagingBuffer = { 0 }; VkDeviceMemory stagingBufferMemory = { 0 }; resFFT = allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; void* data; res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_MAP_MEMORY; memcpy(data, cpu_arr, stagingBufferSize); vkUnmapMemory(app->configuration.device[0], stagingBufferMemory); VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; 
commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = { 0 }; res = vkAllocateCommandBuffers(app->configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; VkBufferCopy copyRegion = { 0 }; copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, ©Region); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(app->configuration.queue[0], 1, &submitInfo, app->configuration.fence[0]); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; res = vkWaitForFences(app->configuration.device[0], 1, app->configuration.fence, VK_TRUE, 100000000000); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; res = vkResetFences(app->configuration.device[0], 1, app->configuration.fence); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer); vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0); vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0); return resFFT; #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; void* buffer = 
((void**)input_buffer)[0]; res = cudaMemcpy(buffer, cpu_arr, transferSize, cudaMemcpyHostToDevice); if (res != cudaSuccess) { return VKFFT_ERROR_FAILED_TO_COPY; } #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; void* buffer = ((void**)input_buffer)[0]; res = hipMemcpy(buffer, cpu_arr, transferSize, hipMemcpyHostToDevice); if (res != hipSuccess) { return VKFFT_ERROR_FAILED_TO_COPY; } #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; cl_mem* buffer = (cl_mem*)input_buffer; cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; res = clEnqueueWriteBuffer(commandQueue, buffer[0], CL_TRUE, 0, transferSize, cpu_arr, 0, NULL, NULL); if (res != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_COPY; } res = clReleaseCommandQueue(commandQueue); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; void* buffer = ((void**)input_buffer)[0]; ze_command_queue_desc_t commandQueueCopyDesc = { ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, 0, app->configuration.commandQueueID, 0, // index 0, // flags ZE_COMMAND_QUEUE_MODE_DEFAULT, ZE_COMMAND_QUEUE_PRIORITY_NORMAL }; ze_command_list_handle_t copyCommandList; res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; } res = zeCommandListAppendMemoryCopy(copyCommandList, buffer, cpu_arr, transferSize, 0, 0, 0); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_COPY; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==5) MTL::Buffer* stagingBuffer = app->configuration.device->newBuffer(cpu_arr, transferSize, 
MTL::ResourceStorageModeShared); MTL::CommandBuffer* copyCommandBuffer = app->configuration.queue->commandBuffer(); if (copyCommandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::BlitCommandEncoder* blitCommandEncoder = copyCommandBuffer->blitCommandEncoder(); if (blitCommandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::Buffer* buffer = ((MTL::Buffer**)input_buffer)[0]; blitCommandEncoder->copyFromBuffer((MTL::Buffer*)stagingBuffer, 0, (MTL::Buffer*)buffer, 0, transferSize); blitCommandEncoder->endEncoding(); copyCommandBuffer->commit(); copyCommandBuffer->waitUntilCompleted(); blitCommandEncoder->release(); copyCommandBuffer->release(); stagingBuffer->release(); #endif return resFFT; } static inline VkFFTResult VkFFT_transferDataToCPU(VkFFTApplication* app, void* cpu_arr, void* output_buffer, uint64_t transferSize) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkBuffer* buffer = (VkBuffer*)output_buffer; VkDeviceSize bufferSize = transferSize; VkResult res = VK_SUCCESS; uint64_t stagingBufferSize = bufferSize; VkBuffer stagingBuffer = { 0 }; VkDeviceMemory stagingBufferMemory = { 0 }; resFFT = allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = { 0 }; res = vkAllocateCommandBuffers(app->configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; VkCommandBufferBeginInfo commandBufferBeginInfo = { 
VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; VkBufferCopy copyRegion = { 0 }; copyRegion.srcOffset = 0; copyRegion.dstOffset = 0; copyRegion.size = stagingBufferSize; vkCmdCopyBuffer(commandBuffer, buffer[0], stagingBuffer, 1, ©Region); res = vkEndCommandBuffer(commandBuffer); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(app->configuration.queue[0], 1, &submitInfo, app->configuration.fence[0]); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; res = vkWaitForFences(app->configuration.device[0], 1, app->configuration.fence, VK_TRUE, 100000000000); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; res = vkResetFences(app->configuration.device[0], 1, app->configuration.fence); if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_RESET_FENCES; vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer); void* data; res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data); if (resFFT != VKFFT_SUCCESS) return resFFT; memcpy(cpu_arr, data, stagingBufferSize); vkUnmapMemory(app->configuration.device[0], stagingBufferMemory); vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0); vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0); #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; void* buffer = ((void**)output_buffer)[0]; res = cudaMemcpy(cpu_arr, buffer, transferSize, cudaMemcpyDeviceToHost); if (res != cudaSuccess) { return VKFFT_ERROR_FAILED_TO_COPY; } #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; void* buffer = 
((void**)output_buffer)[0]; res = hipMemcpy(cpu_arr, buffer, transferSize, hipMemcpyDeviceToHost); if (res != hipSuccess) { return VKFFT_ERROR_FAILED_TO_COPY; } #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; cl_mem* buffer = (cl_mem*)output_buffer; cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; res = clEnqueueReadBuffer(commandQueue, buffer[0], CL_TRUE, 0, transferSize, cpu_arr, 0, NULL, NULL); if (res != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_COPY; } res = clReleaseCommandQueue(commandQueue); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; void* buffer = ((void**)output_buffer)[0]; ze_command_queue_desc_t commandQueueCopyDesc = { ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, 0, app->configuration.commandQueueID, 0, // index 0, // flags ZE_COMMAND_QUEUE_MODE_DEFAULT, ZE_COMMAND_QUEUE_PRIORITY_NORMAL }; ze_command_list_handle_t copyCommandList; res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; } res = zeCommandListAppendMemoryCopy(copyCommandList, cpu_arr, buffer, transferSize, 0, 0, 0); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_COPY; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==5) MTL::Buffer* stagingBuffer = app->configuration.device->newBuffer(transferSize, MTL::ResourceStorageModeShared); MTL::CommandBuffer* copyCommandBuffer = app->configuration.queue->commandBuffer(); if (copyCommandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::BlitCommandEncoder* blitCommandEncoder = 
copyCommandBuffer->blitCommandEncoder(); if (blitCommandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::Buffer* buffer = ((MTL::Buffer**)output_buffer)[0]; blitCommandEncoder->copyFromBuffer((MTL::Buffer*)buffer, 0, (MTL::Buffer*)stagingBuffer, 0, transferSize); blitCommandEncoder->endEncoding(); copyCommandBuffer->commit(); copyCommandBuffer->waitUntilCompleted(); blitCommandEncoder->release(); copyCommandBuffer->release(); memcpy(cpu_arr, stagingBuffer->contents(), transferSize); stagingBuffer->release(); #endif return resFFT; } static inline void deleteAxis(VkFFTApplication* app, VkFFTAxis* axis) { if (axis->specializationConstants.numRaderPrimes) { free(axis->specializationConstants.raderContainer); axis->specializationConstants.raderContainer = 0; axis->specializationConstants.numRaderPrimes = 0; } #if(VKFFT_BACKEND==0) if ((app->configuration.useLUT == 1) && (!axis->referenceLUT)) { if (axis->bufferLUT != 0) { vkDestroyBuffer(app->configuration.device[0], axis->bufferLUT, 0); axis->bufferLUT = 0; } if (axis->bufferLUTDeviceMemory != 0) { vkFreeMemory(app->configuration.device[0], axis->bufferLUTDeviceMemory, 0); axis->bufferLUTDeviceMemory = 0; } } if (axis->descriptorPool != 0) { vkDestroyDescriptorPool(app->configuration.device[0], axis->descriptorPool, 0); axis->descriptorPool = 0; } if (axis->descriptorSetLayout != 0) { vkDestroyDescriptorSetLayout(app->configuration.device[0], axis->descriptorSetLayout, 0); axis->descriptorSetLayout = 0; } if (axis->pipelineLayout != 0) { vkDestroyPipelineLayout(app->configuration.device[0], axis->pipelineLayout, 0); axis->pipelineLayout = 0; } if (axis->pipeline != 0) { vkDestroyPipeline(app->configuration.device[0], axis->pipeline, 0); axis->pipeline = 0; } #elif(VKFFT_BACKEND==1) CUresult res = CUDA_SUCCESS; cudaError_t res_t = cudaSuccess; if ((app->configuration.useLUT == 1) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) { res_t = cudaFree(axis->bufferLUT); if (res_t == cudaSuccess) 
axis->bufferLUT = 0; } if (axis->VkFFTModule != 0) { res = cuModuleUnload(axis->VkFFTModule); if (res == CUDA_SUCCESS) axis->VkFFTModule = 0; } #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; if ((app->configuration.useLUT == 1) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) { res = hipFree(axis->bufferLUT); if (res == hipSuccess) axis->bufferLUT = 0; } if (axis->VkFFTModule != 0) { res = hipModuleUnload(axis->VkFFTModule); if (res == hipSuccess) axis->VkFFTModule = 0; } #elif(VKFFT_BACKEND==3) cl_int res = 0; if ((app->configuration.useLUT == 1) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) { res = clReleaseMemObject(axis->bufferLUT); if (res == 0) axis->bufferLUT = 0; } if (axis->program != 0) { res = clReleaseProgram(axis->program); if (res == 0) axis->program = 0; } if (axis->kernel != 0) { res = clReleaseKernel(axis->kernel); if (res == 0) axis->kernel = 0; } #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; if ((app->configuration.useLUT == 1) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) { res = zeMemFree(app->configuration.context[0], axis->bufferLUT); if (res == ZE_RESULT_SUCCESS) axis->bufferLUT = 0; } if (axis->VkFFTModule != 0) { res = zeModuleDestroy(axis->VkFFTModule); if (res == ZE_RESULT_SUCCESS)axis->VkFFTModule = 0; } if (axis->VkFFTKernel != 0) { res = zeKernelDestroy(axis->VkFFTKernel); if (res == ZE_RESULT_SUCCESS)axis->VkFFTKernel = 0; } #elif(VKFFT_BACKEND==5) if (axis->pushConstants.dataUintBuffer) { axis->pushConstants.dataUintBuffer->release(); axis->pushConstants.dataUintBuffer = 0; } if ((app->configuration.useLUT == 1) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) { ((MTL::Buffer*)axis->bufferLUT)->release(); //free(axis->bufferLUT); axis->bufferLUT = 0; } if (axis->pipeline != 0) { axis->pipeline->release(); //free(axis->pipeline); axis->pipeline = 0; } if (axis->library != 0) { axis->library->release(); //free(axis->library); axis->library = 0; } #endif if 
(app->configuration.saveApplicationToString) { if (axis->binary != 0) { free(axis->binary); axis->binary = 0; } } } static inline void deleteVkFFT(VkFFTApplication* app) { #if(VKFFT_BACKEND==0) if (app->configuration.isCompilerInitialized) { glslang_finalize_process(); app->configuration.isCompilerInitialized = 0; } #elif(VKFFT_BACKEND==1) if (app->configuration.num_streams > 1) { cudaError_t res_t = cudaSuccess; for (uint64_t i = 0; i < app->configuration.num_streams; i++) { if (app->configuration.stream_event[i] != 0) { res_t = cudaEventDestroy(app->configuration.stream_event[i]); if (res_t == cudaSuccess) app->configuration.stream_event[i] = 0; } } if (app->configuration.stream_event != 0) { free(app->configuration.stream_event); app->configuration.stream_event = 0; } } #elif(VKFFT_BACKEND==2) if (app->configuration.num_streams > 1) { hipError_t res_t = hipSuccess; for (uint64_t i = 0; i < app->configuration.num_streams; i++) { if (app->configuration.stream_event[i] != 0) { res_t = hipEventDestroy(app->configuration.stream_event[i]); if (res_t == hipSuccess) app->configuration.stream_event[i] = 0; } } if (app->configuration.stream_event != 0) { free(app->configuration.stream_event); app->configuration.stream_event = 0; } } #endif if (app->numRaderFFTPrimes) { for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { free(app->raderFFTkernel[i]); app->raderFFTkernel[i] = 0; } } if (!app->configuration.userTempBuffer) { if (app->configuration.allocateTempBuffer) { app->configuration.allocateTempBuffer = 0; #if(VKFFT_BACKEND==0) if (app->configuration.tempBuffer[0] != 0) { vkDestroyBuffer(app->configuration.device[0], app->configuration.tempBuffer[0], 0); app->configuration.tempBuffer[0] = 0; } if (app->configuration.tempBufferDeviceMemory != 0) { vkFreeMemory(app->configuration.device[0], app->configuration.tempBufferDeviceMemory, 0); app->configuration.tempBufferDeviceMemory = 0; } #elif(VKFFT_BACKEND==1) cudaError_t res_t = cudaSuccess; if 
(app->configuration.tempBuffer[0] != 0) { res_t = cudaFree(app->configuration.tempBuffer[0]); if (res_t == cudaSuccess) app->configuration.tempBuffer[0] = 0; } #elif(VKFFT_BACKEND==2) hipError_t res_t = hipSuccess; if (app->configuration.tempBuffer[0] != 0) { res_t = hipFree(app->configuration.tempBuffer[0]); if (res_t == hipSuccess) app->configuration.tempBuffer[0] = 0; } #elif(VKFFT_BACKEND==3) cl_int res = 0; if (app->configuration.tempBuffer[0] != 0) { res = clReleaseMemObject(app->configuration.tempBuffer[0]); if (res == 0) app->configuration.tempBuffer[0] = 0; } #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; if (app->configuration.tempBuffer[0] != 0) { res = zeMemFree(app->configuration.context[0], app->configuration.tempBuffer[0]); if (res == ZE_RESULT_SUCCESS) app->configuration.tempBuffer[0] = 0; } #elif(VKFFT_BACKEND==5) if (app->configuration.tempBuffer[0] != 0) { ((MTL::Buffer*)app->configuration.tempBuffer[0])->release(); } #endif if (app->configuration.tempBuffer != 0) { free(app->configuration.tempBuffer); app->configuration.tempBuffer = 0; } } if (app->configuration.tempBufferSize != 0) { free(app->configuration.tempBufferSize); app->configuration.tempBufferSize = 0; } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->configuration.useRaderUintLUT) { for (uint64_t j = 0; j < 4; j++) { if (app->bufferRaderUintLUT[i][j]) { #if(VKFFT_BACKEND==0) vkDestroyBuffer(app->configuration.device[0], app->bufferRaderUintLUT[i][j], 0); app->bufferRaderUintLUT[i][j] = 0; vkFreeMemory(app->configuration.device[0], app->bufferRaderUintLUTDeviceMemory[i][j], 0); app->bufferRaderUintLUTDeviceMemory[i][j] = 0; #elif(VKFFT_BACKEND==1) cudaError_t res_t = cudaSuccess; res_t = cudaFree(app->bufferRaderUintLUT[i][j]); if (res_t == cudaSuccess) app->bufferRaderUintLUT[i][j] = 0; #elif(VKFFT_BACKEND==2) hipError_t res_t = hipSuccess; res_t = hipFree(app->bufferRaderUintLUT[i][j]); if (res_t == hipSuccess) app->bufferRaderUintLUT[i][j] = 0; 
#elif(VKFFT_BACKEND==3) cl_int res = 0; res = clReleaseMemObject(app->bufferRaderUintLUT[i][j]); if (res == 0) app->bufferRaderUintLUT[i][j] = 0; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; res = zeMemFree(app->configuration.context[0], app->bufferRaderUintLUT[i][j]); if (res == ZE_RESULT_SUCCESS) app->bufferRaderUintLUT[i][j] = 0; #elif(VKFFT_BACKEND==5) if (app->bufferRaderUintLUT[i][j] != 0) { ((MTL::Buffer*)app->bufferRaderUintLUT[i][j])->release(); //free(app->bufferRaderUintLUT[i][j]); app->bufferRaderUintLUT[i][j] = 0; } #endif } } } if (app->useBluesteinFFT[i]) { #if(VKFFT_BACKEND==0) if (app->bufferBluestein[i] != 0) { vkDestroyBuffer(app->configuration.device[0], app->bufferBluestein[i], 0); app->bufferBluestein[i] = 0; } if (app->bufferBluesteinDeviceMemory[i] != 0) { vkFreeMemory(app->configuration.device[0], app->bufferBluesteinDeviceMemory[i], 0); app->bufferBluesteinDeviceMemory[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinFFT[i], 0); app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinFFTDeviceMemory[i] != 0) { vkFreeMemory(app->configuration.device[0], app->bufferBluesteinFFTDeviceMemory[i], 0); app->bufferBluesteinFFTDeviceMemory[i] = 0; } if (app->bufferBluesteinIFFT[i] != 0) { vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinIFFT[i], 0); app->bufferBluesteinIFFT[i] = 0; } if (app->bufferBluesteinIFFTDeviceMemory[i] != 0) { vkFreeMemory(app->configuration.device[0], app->bufferBluesteinIFFTDeviceMemory[i], 0); app->bufferBluesteinIFFTDeviceMemory[i] = 0; } #elif(VKFFT_BACKEND==1) cudaError_t res_t = cudaSuccess; if (app->bufferBluestein[i] != 0) { res_t = cudaFree(app->bufferBluestein[i]); if (res_t == cudaSuccess) app->bufferBluestein[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { res_t = cudaFree(app->bufferBluesteinFFT[i]); if (res_t == cudaSuccess) app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinIFFT[i] != 0) { res_t = 
cudaFree(app->bufferBluesteinIFFT[i]); if (res_t == cudaSuccess) app->bufferBluesteinIFFT[i] = 0; } #elif(VKFFT_BACKEND==2) hipError_t res_t = hipSuccess; if (app->bufferBluestein[i] != 0) { res_t = hipFree(app->bufferBluestein[i]); if (res_t == hipSuccess) app->bufferBluestein[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { res_t = hipFree(app->bufferBluesteinFFT[i]); if (res_t == hipSuccess) app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinIFFT[i] != 0) { res_t = hipFree(app->bufferBluesteinIFFT[i]); if (res_t == hipSuccess) app->bufferBluesteinIFFT[i] = 0; } #elif(VKFFT_BACKEND==3) cl_int res = 0; if (app->bufferBluestein[i] != 0) { res = clReleaseMemObject(app->bufferBluestein[i]); if (res == 0) app->bufferBluestein[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { res = clReleaseMemObject(app->bufferBluesteinFFT[i]); if (res == 0) app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinIFFT[i] != 0) { res = clReleaseMemObject(app->bufferBluesteinIFFT[i]); if (res == 0) app->bufferBluesteinIFFT[i] = 0; } #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; if (app->bufferBluestein[i] != 0) { res = zeMemFree(app->configuration.context[0], app->bufferBluestein[i]); if (res == ZE_RESULT_SUCCESS) app->bufferBluestein[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { res = zeMemFree(app->configuration.context[0], app->bufferBluesteinFFT[i]); if (res == ZE_RESULT_SUCCESS) app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinIFFT[i] != 0) { res = zeMemFree(app->configuration.context[0], app->bufferBluesteinIFFT[i]); if (res == ZE_RESULT_SUCCESS) app->bufferBluesteinIFFT[i] = 0; } #elif(VKFFT_BACKEND==5) if (app->bufferBluestein[i] != 0) { ((MTL::Buffer*)app->bufferBluestein[i])->release(); //free(app->bufferBluestein[i]); app->bufferBluestein[i] = 0; } if (app->bufferBluesteinFFT[i] != 0) { ((MTL::Buffer*)app->bufferBluesteinFFT[i])->release(); //free(app->bufferBluesteinFFT[i]); app->bufferBluesteinFFT[i] = 0; } if (app->bufferBluesteinIFFT[i] 
!= 0) { ((MTL::Buffer*)app->bufferBluesteinIFFT[i])->release(); //free(app->bufferBluesteinIFFT[i]); app->bufferBluesteinIFFT[i] = 0; } #endif } } if (!app->configuration.makeInversePlanOnly) { if (app->localFFTPlan != 0) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->localFFTPlan->numAxisUploads[i] > 0) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) deleteAxis(app, &app->localFFTPlan->axes[i][j]); } } if (app->localFFTPlan->multiUploadR2C) { deleteAxis(app, &app->localFFTPlan->R2Cdecomposition); } if (app->localFFTPlan != 0) { free(app->localFFTPlan); app->localFFTPlan = 0; } } } if (!app->configuration.makeForwardPlanOnly) { if (app->localFFTPlan_inverse != 0) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->localFFTPlan_inverse->numAxisUploads[i] > 0) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j]); } } if (app->localFFTPlan_inverse->multiUploadR2C) { deleteAxis(app, &app->localFFTPlan_inverse->R2Cdecomposition); } if (app->localFFTPlan_inverse != 0) { free(app->localFFTPlan_inverse); app->localFFTPlan_inverse = 0; } } } if (app->configuration.saveApplicationToString) { if (app->saveApplicationString != 0) { free(app->saveApplicationString); app->saveApplicationString = 0; } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->applicationBluesteinString[i] != 0) { free(app->applicationBluesteinString[i]); app->applicationBluesteinString[i] = 0; } } } if (app->configuration.autoCustomBluesteinPaddingPattern) { if (app->configuration.primeSizes != 0) { free(app->configuration.primeSizes); app->configuration.primeSizes = 0; } if (app->configuration.paddedSizes != 0) { free(app->configuration.paddedSizes); app->configuration.paddedSizes = 0; } } } static inline VkFFTResult VkFFTGetRegistersPerThread(VkFFTApplication* app, uint64_t fft_length, uint64_t extraSharedMemoryForPow2, uint64_t max_rhs, 
uint64_t useRader, uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) { for (uint64_t i = 0; i < 33; i++) { registers_per_thread_per_radix[i] = 0; } registers_per_thread[0] = 0; min_registers_per_thread[0] = -1; if (loc_multipliers[2] > 0) { if (loc_multipliers[3] > 0) { if (loc_multipliers[5] > 0) { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; case 3: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; case 3: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; case 3: registers_per_thread_per_radix[2] = 12; 
registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; case 3: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) 
{ case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 15; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; break; } registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 5; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 10; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 10; break; } registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } else { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; 
registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; } } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; 
registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; } } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; } } else { if (loc_multipliers[2] == loc_multipliers[3]) { registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 6; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; 
registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; } } } } } } } else { if (loc_multipliers[5] > 0) { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 
14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; } } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; case 2: 
registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; } } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; 
registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; } } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 10; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; } } } } } else { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: 
registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; 
registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; } } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 3: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 16; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; case 2: registers_per_thread_per_radix[2] = 14; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; break; } } } } else { if 
(loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; break; } } else { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 10; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; default: registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; break; } } } else { if (loc_multipliers[13] > 0) { switch (loc_multipliers[2]) { case 1: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; case 2: registers_per_thread_per_radix[2] = 12; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; default: 
registers_per_thread_per_radix[2] = 8; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; break; } } else { uint64_t max_loc_multipliers_pow2 = 0; uint64_t active_threads_y = max_rhs / 64; //estimate workbalance across CU (assume we have 64 CU) if (active_threads_y == 0) active_threads_y = 1; uint64_t testMinStages = -1; uint64_t maxRadixMinStages = 1; uint64_t fixMaxCheckRadix2 = 3; #if(VKFFT_BACKEND==1) fixMaxCheckRadix2 = (((fft_length >= 1024) || (fft_length == 256)) && (extraSharedMemoryForPow2) && (!useRader)) ? 5 : 3; #endif for (uint64_t i = 1; i <= fixMaxCheckRadix2; i++) { uint64_t numStages = (uint64_t)ceil(log2(fft_length) / ((double)i)); if (numStages < testMinStages) { testMinStages = numStages; maxRadixMinStages = i; } } for (uint64_t i = maxRadixMinStages; i >= 1; i--) { uint64_t active_threads_x = (active_threads_y * fft_length) / ((uint64_t)pow(2, i)); if (active_threads_x >= 128) { max_loc_multipliers_pow2 = i; i = 1; } } if (max_loc_multipliers_pow2 < 3) max_loc_multipliers_pow2 = 3; uint64_t final_loc_multipliers_pow2 = 1; uint64_t num_stages_min = (uint64_t)log2(fft_length); for (uint64_t i = 2; i <= max_loc_multipliers_pow2; i++) { uint64_t num_stages = (uint64_t)ceil(((uint64_t)log2(fft_length)) / (double)i); if (num_stages < num_stages_min) { final_loc_multipliers_pow2 = i; num_stages_min = num_stages; } } registers_per_thread_per_radix[2] = (loc_multipliers[2] > final_loc_multipliers_pow2) ? (uint64_t)pow(2, final_loc_multipliers_pow2) : (uint64_t)pow(2, loc_multipliers[2]); registers_per_thread_per_radix[2] = (loc_multipliers[2] < 3) ? 
(uint64_t)pow(2, loc_multipliers[2]) : registers_per_thread_per_radix[2]; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } } } else { if (loc_multipliers[3] > 0) { if (loc_multipliers[5] > 0) { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; 
registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 15; registers_per_thread_per_radix[5] = 15; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } else { if (loc_multipliers[7] > 0) { if (loc_multipliers[3] == 1) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 14; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 6; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; 
registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } else { if (loc_multipliers[3] == 1) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 12; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 3; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 
0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 9; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } } } else { if (loc_multipliers[5] > 0) { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] 
= 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 5; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } } else { if (loc_multipliers[7] > 0) { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; 
registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 7; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 0; } } } else { if (loc_multipliers[11] > 0) { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 13; } else { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 11; registers_per_thread_per_radix[13] = 0; } } else { if (loc_multipliers[13] > 0) { registers_per_thread_per_radix[2] = 0; registers_per_thread_per_radix[3] = 0; registers_per_thread_per_radix[5] = 0; registers_per_thread_per_radix[7] = 0; registers_per_thread_per_radix[11] = 0; registers_per_thread_per_radix[13] = 13; } else { min_registers_per_thread[0] = 2; registers_per_thread[0] = 2; //Rader-only sequence //return VKFFT_ERROR_UNSUPPORTED_RADIX; } } } } } } registers_per_thread_per_radix[32] = ((registers_per_thread_per_radix[2] % 32) == 0) ? registers_per_thread_per_radix[2] : 0; registers_per_thread_per_radix[16] = ((registers_per_thread_per_radix[2] % 16) == 0) ? registers_per_thread_per_radix[2] : 0; registers_per_thread_per_radix[8] = ((registers_per_thread_per_radix[2] % 8) == 0) ? registers_per_thread_per_radix[2] : 0; registers_per_thread_per_radix[4] = ((registers_per_thread_per_radix[2] % 4) == 0) ? 
registers_per_thread_per_radix[2] : 0; if ((registers_per_thread_per_radix[2] >= 12) && (registers_per_thread_per_radix[3] >= 12)) { registers_per_thread_per_radix[12] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; if ((registers_per_thread_per_radix[12] % 12) != 0) registers_per_thread_per_radix[12] = 0; } registers_per_thread_per_radix[6] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; registers_per_thread_per_radix[9] = ((registers_per_thread_per_radix[3] % 9) == 0) ? registers_per_thread_per_radix[3] : 0; registers_per_thread_per_radix[10] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[5]) ? registers_per_thread_per_radix[5] : registers_per_thread_per_radix[2]; registers_per_thread_per_radix[14] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[7]) ? registers_per_thread_per_radix[7] : registers_per_thread_per_radix[2]; registers_per_thread_per_radix[15] = (registers_per_thread_per_radix[3] > registers_per_thread_per_radix[5]) ? 
registers_per_thread_per_radix[5] : registers_per_thread_per_radix[3]; for (uint64_t i = 0; i < 33; i++) { if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i]; if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; } if ((registers_per_thread[0] > 16) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0; else isGoodSequence[0] = 1; return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTGetRegistersPerThreadOptimizeShared(uint64_t fft_length, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread) { //try to split sequence in supported radix to optimize sm usage uint64_t numStages = 20; uint64_t fft_length_copy; uint64_t stages[20]; uint64_t k = 0; for (uint64_t i = 0; i < 33; i++) { registers_per_thread_per_radix[i] = 0; } registers_per_thread[0] = 0; min_registers_per_thread[0] = -1; for (uint64_t i = 1; i < numStages; i++) { fft_length_copy = fft_length; uint64_t min_comb_radix = (uint64_t)floor(pow(fft_length_copy, 1.0 / i)); if (min_comb_radix <= 16) { for (uint64_t j = 0; j < 20; j++) { stages[j] = 0; } k = 0; for (uint64_t j = min_comb_radix; j <= 16; j++) { if (k < i) { if ((fft_length_copy % j) == 0) { fft_length_copy /= j; min_comb_radix = (uint64_t)floor(pow(fft_length_copy, 1.0 / (i - k - 1))); stages[k] = j; j = min_comb_radix - 1; k++; } } } if ((fft_length_copy == 1) && (k == i)) break; } } for (uint64_t i = 0; i < k; i++) { for (uint64_t j = 2; j <= stages[i]; j++) { if ((stages[i] % j) == 0) { if (registers_per_thread_per_radix[j] < stages[i]) registers_per_thread_per_radix[j] = stages[i]; } } } for (uint64_t i = 0; i < 33; i++) { if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > 
registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; } for (uint64_t i = 0; i < 33; i++) { if (registers_per_thread_per_radix[i] != 0) { double ratio = (registers_per_thread[0] / (double)registers_per_thread_per_radix[i]); uint64_t ratio_ceil = (uint64_t)ceil(ratio); uint64_t ratio_floor = (uint64_t)floor(ratio); double ratio2 = ((registers_per_thread_per_radix[i] * ratio_ceil) / (double)registers_per_thread[0]); double ratio3 = (registers_per_thread[0] / (double)(registers_per_thread_per_radix[i] * ratio_floor)); if (ratio2 > ratio3) registers_per_thread_per_radix[i] *= ratio_floor; else { registers_per_thread_per_radix[i] *= ratio_ceil; } } } registers_per_thread[0] = 0; for (uint64_t i = 0; i < 33; i++) { if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i]; if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; } return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRaderContainer** raderContainer_input, uint64_t* tempSequence, uint64_t* numRaderPrimes, uint64_t fft_radix_part) { VkFFTResult res = VKFFT_SUCCESS; uint64_t locTempSequence = tempSequence[0]; uint64_t tempSequence_copy = tempSequence[0]; uint64_t limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? 
app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit; i++) { if (locTempSequence % i == 0) { numRaderPrimes[0]++; while (locTempSequence % i == 0) locTempSequence /= i; } } for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { if (locTempSequence % i == 0) { numRaderPrimes[0]++; while (locTempSequence % i == 0) locTempSequence /= i; } } raderContainer_input[0] = (VkFFTRaderContainer*)calloc(sizeof(VkFFTRaderContainer), numRaderPrimes[0]); if (raderContainer_input[0] == 0) return VKFFT_ERROR_MALLOC_FAILED; VkFFTRaderContainer* raderContainer = raderContainer_input[0]; uint64_t tempSequence_temp = 1; limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit; i++) { if (tempSequence[0] % i == 0) { if (i < app->configuration.fixMinRaderPrimeFFT) { tempSequence_temp *= i; tempSequence[0] /= i; i--; continue; } //Sophie Germain safe prime check uint64_t tempSequence2 = i - 1; for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { if (tempSequence2 % j == 0) { tempSequence2 /= j; j--; } } if (tempSequence2 != 1) { tempSequence_temp *= i; tempSequence[0] /= i; i--; continue; } tempSequence[0] /= i; for (uint64_t j = 0; j < numRaderPrimes[0]; j++) { if (raderContainer[j].prime == i) { raderContainer[j].multiplier++; j = numRaderPrimes[0]; } else if (raderContainer[j].prime == 0) { raderContainer[j].type = 0; raderContainer[j].prime = i; raderContainer[j].multiplier = 1; j = numRaderPrimes[0]; } } i--; } } tempSequence[0] *= tempSequence_temp; for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { if (tempSequence[0] % i == 0) { tempSequence[0] /= i; for (uint64_t j = 0; j < numRaderPrimes[0]; j++) { if 
(raderContainer[j].prime == i) { raderContainer[j].multiplier++; j = numRaderPrimes[0]; } else if (raderContainer[j].prime == 0) { raderContainer[j].type = 1; raderContainer[j].prime = i; raderContainer[j].multiplier = 1; j = numRaderPrimes[0]; } } i--; } } //main loop for all primes for (uint64_t i = 0; i < numRaderPrimes[0]; i++) { //generator loop for (uint64_t r = 2; r < raderContainer[i].prime; r++) { uint64_t test = r; for (uint64_t iter = 0; iter < raderContainer[i].prime - 2; iter++) { if (test == 1) { test = 0; iter = raderContainer[i].prime; } test = ((test * r) % raderContainer[i].prime); } if (test == 1) { raderContainer[i].generator = r; r = raderContainer[i].prime; } } //subsplit and information initialization if (raderContainer[i].type) {//Multiplication raderContainer[i].registers_per_thread = 2; raderContainer[i].min_registers_per_thread = 2; } else {//FFT locTempSequence = raderContainer[i].prime - 1; raderContainer[i].containerFFTDim = raderContainer[i].prime - 1; raderContainer[i].containerFFTNum = fft_radix_part * tempSequence_copy / raderContainer[i].prime; uint64_t stageID = 0; for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { if (locTempSequence % j == 0) { locTempSequence /= j; raderContainer[i].loc_multipliers[j]++; //raderContainer[i].stageRadix[stageID] = j; //raderContainer[i].numThreadLaunches[stageID] = fft_radix_part * (tempSequence_copy / raderContainer[i].prime) * ((raderContainer[i].prime-1) / j); //stageID++; j--; } } //uint64_t isGoodSequence; //if (raderContainer[i].containerFFTNum<8) res = VkFFTGetRegistersPerThreadOptimizeShared(raderContainer[i].prime - 1, raderContainer[i].registers_per_thread_per_radix, &raderContainer[i].registers_per_thread, &raderContainer[i].min_registers_per_thread); //else //res = VkFFTGetRegistersPerThread(raderContainer[i].prime - 1, 0, 0, 1, raderContainer[i].loc_multipliers, raderContainer[i].registers_per_thread_per_radix, &raderContainer[i].registers_per_thread, 
&raderContainer[i].min_registers_per_thread, &isGoodSequence); if (res != VKFFT_SUCCESS) return res; if (locTempSequence != 1) { res = VkFFTConstructRaderTree(app, &raderContainer[i].container, &locTempSequence, &raderContainer[i].numSubPrimes, fft_radix_part * tempSequence_copy / raderContainer[i].prime); if (res != VKFFT_SUCCESS) return res; for (uint64_t j = 0; j < raderContainer[i].numSubPrimes; j++) { for (uint64_t t = 0; t < raderContainer[i].container[j].multiplier; t++) { raderContainer[i].stageRadix[stageID] = raderContainer[i].container[j].prime; stageID++; } } } raderContainer[i].numStages = stageID; } } return res; } static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t fftDim, uint64_t* min_registers_per_thread, uint64_t* registers_per_thread, uint64_t* registers_per_thread_per_radix) { VkFFTResult res = VKFFT_SUCCESS; for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { if (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0] >= 2) { min_registers_per_thread[0] *= (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0]); for (uint64_t j = 0; j < 33; j++) { if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread[0])) registers_per_thread_per_radix[j] *= (uint64_t)ceil(min_registers_per_thread[0] / (double)registers_per_thread_per_radix[j]); } for (uint64_t j = 0; j < 33; j++) { if (registers_per_thread_per_radix[j] > registers_per_thread[0]) registers_per_thread[0] = registers_per_thread_per_radix[j]; } } else if (min_registers_per_thread[0] / raderContainer[i].min_registers_per_thread >= 2) { raderContainer[i].min_registers_per_thread *= (min_registers_per_thread[0] / raderContainer[i].min_registers_per_thread); for (uint64_t j = 0; j < 33; j++) { if ((raderContainer[i].registers_per_thread_per_radix[j] > 0) && 
(raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread)) raderContainer[i].registers_per_thread_per_radix[j] *= (uint64_t)ceil(raderContainer[i].min_registers_per_thread / (double)raderContainer[i].registers_per_thread_per_radix[j]); } for (uint64_t j = 0; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } if (raderContainer[i].min_registers_per_thread < min_registers_per_thread[0]) { for (uint64_t j = 0; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { while (raderContainer[i].registers_per_thread_per_radix[j] < min_registers_per_thread[0]) raderContainer[i].registers_per_thread_per_radix[j] += j; if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } } for (int64_t j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] != 0) { double scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); while (((uint64_t)ceil(fftDim / (double)min_registers_per_thread[0])) < (raderContainer[i].containerFFTNum * scaling)) { raderContainer[i].registers_per_thread_per_radix[j] += j; scaling = (raderContainer[i].containerFFTDim > raderContainer[i].registers_per_thread_per_radix[j]) ? 
ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]) : 1.0 / floor(raderContainer[i].registers_per_thread_per_radix[j] / (double)raderContainer[i].containerFFTDim); } if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } if (raderContainer[i].registers_per_thread > registers_per_thread[0]) registers_per_thread[0] = raderContainer[i].registers_per_thread; } } //try to increase registers usage closer to registers_per_thread across all primes for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { for (int64_t j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { while ((raderContainer[i].registers_per_thread_per_radix[j] + j) <= registers_per_thread[0] + 1) {// fix raderContainer[i].registers_per_thread_per_radix[j] += j; } } } raderContainer[i].registers_per_thread = 0; raderContainer[i].min_registers_per_thread = -1; for (int64_t j = 2; j < 33; j++) { if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { if (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread) { raderContainer[i].min_registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) { raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; } } } } } //subprimes optimization for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { if (raderContainer[i].numSubPrimes) { res = VkFFTOptimizeRaderFFTRegisters(raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } } return res; } static inline VkFFTResult 
VkFFTOptimizeRadixKernels(uint64_t* registers_per_thread_per_radix, uint64_t* loc_multipliers, uint64_t registerBoost, uint64_t* maxNonPow2Radix, uint64_t* reqLocRegs, VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes) {
	/*
	 * Regroups the small-radix multiplier counts in loc_multipliers into larger fused radices
	 * (32, 16, 15, 14, 12, 10, 9, 8, 6, 4 - tried in that order) wherever
	 * registers_per_thread_per_radix permits, and records in maxNonPow2Radix[0] / reqLocRegs[0]
	 * the largest non-power-of-two radix in use and the largest usedLocRegs value among them.
	 * Sub-containers are processed first, always with registerBoost = 1.
	 * Returns the first error from a recursive call, otherwise VKFFT_SUCCESS.
	 */
	VkFFTResult res = VKFFT_SUCCESS;
	if (numRaderPrimes) {
		for (uint64_t i = 0; i < numRaderPrimes; i++) {
			res = VkFFTOptimizeRadixKernels(raderContainer[i].registers_per_thread_per_radix, raderContainer[i].loc_multipliers, 1, maxNonPow2Radix, reqLocRegs, raderContainer[i].container, raderContainer[i].numSubPrimes);
			if (res != VKFFT_SUCCESS) return res;
		}
	}
	//optimize used radix kernels
	// Power-of-two radices: every 5 / 4 / 3 / 2 factors of two become one radix-32 / 16 / 8 / 4
	// stage; the radix inherits the radix-2 register count when that count is a multiple of it.
	if (((registers_per_thread_per_radix[32] > 0) || ((registers_per_thread_per_radix[2] % 32) == 0)) && ((registers_per_thread_per_radix[32]) % 32 == 0) && (loc_multipliers[2] >= 5)) {
		loc_multipliers[32] = loc_multipliers[2] / 5;
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[32] * 5;
		if ((registers_per_thread_per_radix[2] % 32) == 0) registers_per_thread_per_radix[32] = registers_per_thread_per_radix[2];
	}
	if (((registers_per_thread_per_radix[16] > 0) || ((registers_per_thread_per_radix[2] % 16) == 0)) && ((registers_per_thread_per_radix[16]) % 16 == 0) && (loc_multipliers[2] >= 4)) {
		loc_multipliers[16] = loc_multipliers[2] / 4;
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[16] * 4;
		if ((registers_per_thread_per_radix[2] % 16) == 0) registers_per_thread_per_radix[16] = registers_per_thread_per_radix[2];//if we got 16 regs, why not use r16 kernel
	}
	// Composite radices: pair up as many of the two constituent factors as are available.
	if ((registers_per_thread_per_radix[15] > 0) && ((registers_per_thread_per_radix[15]) % 15 == 0) && (loc_multipliers[3] >= 1) && (loc_multipliers[5] >= 1)) {
		loc_multipliers[15] = (loc_multipliers[3] > loc_multipliers[5]) ? loc_multipliers[5] : loc_multipliers[3];
		loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[15];
		loc_multipliers[5] = loc_multipliers[5] - loc_multipliers[15];
	}
	if ((registers_per_thread_per_radix[14] > 0) && ((registers_per_thread_per_radix[14]) % 14 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[7] >= 1)) {
		loc_multipliers[14] = (loc_multipliers[2] > loc_multipliers[7]) ? loc_multipliers[7] : loc_multipliers[2];
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[14];
		loc_multipliers[7] = loc_multipliers[7] - loc_multipliers[14];
	}
	// Radix 12 consumes two factors of 2 and one factor of 3 per stage.
	if ((registers_per_thread_per_radix[12] > 0) && ((registers_per_thread_per_radix[12]) % 12 == 0) && (loc_multipliers[2] >= 2) && (loc_multipliers[3] >= 1)) {
		loc_multipliers[12] = (loc_multipliers[2] > 2 * loc_multipliers[3]) ? loc_multipliers[3] : loc_multipliers[2] / 2;
		loc_multipliers[2] = loc_multipliers[2] - 2 * loc_multipliers[12];
		loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[12];
	}
	if ((registers_per_thread_per_radix[10] > 0) && ((registers_per_thread_per_radix[10]) % 10 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[5] >= 1)) {
		loc_multipliers[10] = (loc_multipliers[2] > loc_multipliers[5]) ? loc_multipliers[5] : loc_multipliers[2];
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[10];
		loc_multipliers[5] = loc_multipliers[5] - loc_multipliers[10];
	}
	if ((registers_per_thread_per_radix[9] > 0) && ((registers_per_thread_per_radix[9]) % 9 == 0) && (loc_multipliers[3] >= 2)) {
		loc_multipliers[9] = loc_multipliers[3] / 2;
		loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[9] * 2;
	}
	if (((registers_per_thread_per_radix[8] > 0) || ((registers_per_thread_per_radix[2] % 8) == 0)) && ((registers_per_thread_per_radix[8]) % 8 == 0) && (loc_multipliers[2] >= 3)) {
		loc_multipliers[8] = loc_multipliers[2] / 3;
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
		if ((registers_per_thread_per_radix[2] % 8) == 0) registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
	}
	if ((registers_per_thread_per_radix[6] > 0) && ((registers_per_thread_per_radix[6]) % 6 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[3] >= 1)) {
		loc_multipliers[6] = (loc_multipliers[2] > loc_multipliers[3]) ? loc_multipliers[3] : loc_multipliers[2];
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[6];
		loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[6];
	}
	if (((registers_per_thread_per_radix[4] > 0) || ((registers_per_thread_per_radix[2] % 4) == 0)) && ((registers_per_thread_per_radix[4]) % 4 == 0) && (loc_multipliers[2] >= 2)) {
		loc_multipliers[4] = loc_multipliers[2] / 2;
		loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
		if ((registers_per_thread_per_radix[2] % 4) == 0) registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
	}
	// registerBoost == 2 with no radix-2 stage left: split one larger power-of-two stage so
	// that a radix-2 stage exists again (the product of all radices is unchanged).
	if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
		if (loc_multipliers[4] > 0) {
			loc_multipliers[4]--;
			loc_multipliers[2] = 2;
		}
		else if (loc_multipliers[8] > 0) {
			loc_multipliers[8]--;
			loc_multipliers[4]++;
			loc_multipliers[2]++;
		}
		else if (loc_multipliers[16] > 0) {
			loc_multipliers[16]--;
			loc_multipliers[8]++;
			loc_multipliers[2]++;
		}
		else if (loc_multipliers[32] > 0) {
			loc_multipliers[32]--;
			loc_multipliers[16]++;
			loc_multipliers[2]++;
		}
	}
	// registerBoost == 4 with no radix-4 stage left: same idea, producing a radix-4 stage.
	if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
		if (loc_multipliers[8] > 0) {
			loc_multipliers[8]--;
			loc_multipliers[4]++;
			loc_multipliers[2]++;
		}
		else if (loc_multipliers[16] > 0) {
			if (loc_multipliers[2] == 0) {
				loc_multipliers[16]--;
				loc_multipliers[4] = 2;
			}
			else {
				loc_multipliers[16]--;
				loc_multipliers[4]++;
				loc_multipliers[2]--;
				loc_multipliers[8]++;
			}
		}
		else if (loc_multipliers[32] > 0) {
			if (loc_multipliers[2] == 0) {
				loc_multipliers[32]--;
				loc_multipliers[8]++;
				loc_multipliers[4]++;
			}
			else {
				loc_multipliers[32]--;
				loc_multipliers[16]++;
				loc_multipliers[4]++;
				loc_multipliers[2]--;
			}
		}
	}
	// (i & (i - 1)) != 0 selects the radices that are not powers of two.
	for (uint64_t i = 2; i < 33; i++) {
		uint64_t usedLocRegs = 0;
		if (loc_multipliers[i] > 0) {
			switch (i) {
			case 6:
				usedLocRegs = 3;
				break;
			case 9:
				usedLocRegs = 3;
				break;
			case 10:
				usedLocRegs = 5;
				break;
			case 12:
				usedLocRegs = 3;
				break;
			case 14:
				usedLocRegs = 7;
				break;
			case 15:
				usedLocRegs = 5;
				break;
			default:
				usedLocRegs = i;
				break;
			}
		}
		if ((loc_multipliers[i] > 0) && ((i & (i - 1)) != 0) && (i > maxNonPow2Radix[0])) {
			maxNonPow2Radix[0] = i;
		}
		if ((usedLocRegs > reqLocRegs[0]) && ((i & (i - 1)) != 0)) {
			reqLocRegs[0] = usedLocRegs;
		}
	}
	return res;
}

/*
 * Emits the stage list for a set of Rader containers.
 * First loop: for every container, appends one entry (prime and generator) to
 * stageRadix / stage_rader_generator at index stageid[0] per unit of its multiplier,
 * decrementing multiplier down to 0 in the process ("i--" re-visits the same container).
 * Second loop: for FFT-type containers (type == 0), recurses into the sub-containers and then
 * appends the container's own radix stages from loc_multipliers, scanning radices from 32
 * down to 2 ("j++" re-visits the same radix); loc_multipliers is consumed as well.
 * Returns the first error from a recursive call, otherwise VKFFT_SUCCESS.
 */
static inline VkFFTResult VkFFTGetRaderFFTStages(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* stageid, uint64_t* stageRadix, uint64_t* stage_rader_generator) {
	VkFFTResult res = VKFFT_SUCCESS;
	for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) {
		if (raderContainer[i].multiplier > 0) {
			stageRadix[stageid[0]] = raderContainer[i].prime;
			stage_rader_generator[stageid[0]] = raderContainer[i].generator;
			raderContainer[i].multiplier--;
			i--;
			stageid[0]++;
			//axes[k].specializationConstants.numStages++;
			//find primitive root
		}
	}
	for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) {
		if (raderContainer[i].type == 0) {
			if (raderContainer[i].numSubPrimes > 0) {
				res = VkFFTGetRaderFFTStages(raderContainer[i].container, raderContainer[i].numSubPrimes, &raderContainer[i].numStages, raderContainer[i].stageRadix, raderContainer[i].stage_rader_generator);
				if (res != VKFFT_SUCCESS) return res;
			}
			for (uint64_t j = 32; j > 1; j--) {
				if (raderContainer[i].loc_multipliers[j] > 0) {
					raderContainer[i].stageRadix[raderContainer[i].numStages] = j;
					raderContainer[i].loc_multipliers[j]--;
					j++;
					raderContainer[i].numStages++;
				}
			}
			/*//make that convolution step uses min_regs radix - max working threads
			uint64_t stage_id_swap = axes[k].specializationConstants.raderContainer[i].numStages - 1;
			uint64_t temp_radix = axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1];
			uint64_t temp_regs = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1]];
			for (uint64_t j = 0; j < axes[k].specializationConstants.raderContainer[i].numStages-1; j++) {
if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[j]] < axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]) stage_id_swap = j; } axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1] = axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]; axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1]] = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]; axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap] = temp_radix; axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]] = temp_regs; //make that first step uses second to min_regs radix stage_id_swap = 0; temp_radix = axes[k].specializationConstants.raderContainer[i].stageRadix[0]; temp_regs = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[0]]; for (uint64_t j = 1; j < axes[k].specializationConstants.raderContainer[i].numStages - 1; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[j]] < axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]) stage_id_swap = j; } axes[k].specializationConstants.raderContainer[i].stageRadix[0] = 
axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]; axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[0]] = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]; axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap] = temp_radix; axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]] = temp_regs; */ } } return res; } static inline VkFFTResult VkFFTMinMaxRegisterCheck(uint64_t numStages, uint64_t* stageRadix, uint64_t* min_registers_per_thread, uint64_t* registers_per_thread, uint64_t* registers_per_thread_per_radix, VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* stage_rader_generator) { VkFFTResult res = VKFFT_SUCCESS; for (int64_t j = 0; j < (int64_t)numStages; j++) { if (stage_rader_generator[j] == 0) { if (registers_per_thread_per_radix[stageRadix[j]] > 0) { if (registers_per_thread_per_radix[stageRadix[j]] < min_registers_per_thread[0]) { min_registers_per_thread[0] = registers_per_thread_per_radix[stageRadix[j]]; } if (registers_per_thread_per_radix[stageRadix[j]] > registers_per_thread[0]) { registers_per_thread[0] = registers_per_thread_per_radix[stageRadix[j]]; } } } else { for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { if (raderContainer[i].prime == stageRadix[j]) { if (raderContainer[i].type == 0) { for (int64_t j2 = 0; j2 < (int64_t)raderContainer[i].numStages; j2++) { if (raderContainer[i].stage_rader_generator[j] == 0) { if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] > 0) { if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] < min_registers_per_thread[0]) { min_registers_per_thread[0] = 
raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]]; } if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] > registers_per_thread[0]) { registers_per_thread[0] = raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]]; } } } else { res = VkFFTMinMaxRegisterCheck(raderContainer[i].numStages, raderContainer[i].stageRadix, min_registers_per_thread, registers_per_thread, raderContainer[i].registers_per_thread_per_radix, raderContainer[i].container, raderContainer[i].numSubPrimes, raderContainer[i].stage_rader_generator); if (res != VKFFT_SUCCESS) return res; } } } } } } } return res; } static inline VkFFTResult VkFFTGetRaderFFTThreadsNum(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* numThreads) { VkFFTResult res = VKFFT_SUCCESS; for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { if (raderContainer[i].type == 0) { if (raderContainer[i].numSubPrimes > 0) { res = VkFFTGetRaderFFTThreadsNum(raderContainer[i].container, raderContainer[i].numSubPrimes, numThreads); if (res != VKFFT_SUCCESS) return res; } for (int64_t j = 0; j < (int64_t)raderContainer[i].numStages; j++) { if (raderContainer[i].stage_rader_generator[j] == 0) { if (raderContainer[i].containerFFTNum * (uint64_t)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]) > numThreads[0]) numThreads[0] = raderContainer[i].containerFFTNum * (uint64_t)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]); } } } } return res; } static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id) { VkFFTResult res = VKFFT_SUCCESS; VkFFTAxis* axes = FFTPlan->axes[axis_id]; uint64_t complexSize; if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) complexSize = (2 * sizeof(double)); 
else if (app->configuration.halfPrecision) complexSize = (2 * sizeof(float)); else complexSize = (2 * sizeof(float)); uint64_t usedSharedMemory = ((app->configuration.size[axis_id] & (app->configuration.size[axis_id] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : app->configuration.sharedMemorySize; uint64_t maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; uint64_t nonStridedAxisId = (app->configuration.considerAllAxesStrided) ? -1 : 0; uint64_t max_rhs = 1; for (uint64_t i = 0; i < 3; i++) { FFTPlan->actualFFTSizePerAxis[axis_id][i] = app->configuration.size[i]; if ((FFTPlan->actualFFTSizePerAxis[axis_id][i] > 0)) max_rhs *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; } if (app->configuration.numberBatches > app->actualNumBatches) max_rhs *= app->configuration.numberBatches; else max_rhs *= app->actualNumBatches; if (app->configuration.coordinateFeatures > 0) max_rhs *= app->configuration.coordinateFeatures; if (app->configuration.numberKernels > 0) max_rhs *= app->configuration.numberKernels; FFTPlan->actualPerformR2CPerAxis[axis_id] = app->configuration.performR2C; if ((axis_id == 0) && (app->configuration.performR2C) && (app->configuration.size[axis_id] > maxSingleSizeNonStrided)) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; FFTPlan->multiUploadR2C = 1; } if (app->configuration.performDCT == 1) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = 2 * app->configuration.size[axis_id] - 2; // now in actualFFTSize - modified dimension size for R2C/DCT } if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT 
//FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] * 8; // now in actualFFTSize - modified dimension size for R2C/DCT } if ((axis_id > 0) && (app->configuration.performR2C)) { FFTPlan->actualFFTSizePerAxis[axis_id][0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / 2 + 1; } if (axis_id != nonStridedAxisId) { if (app->configuration.performBandwidthBoost > 0) axes->specializationConstants.performBandwidthBoost = app->configuration.performBandwidthBoost; } //initial Stockham + Rader check uint64_t multipliers[33]; for (uint64_t i = 0; i < 33; i++) { multipliers[i] = 0; } uint64_t tempSequence = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; for (uint64_t i = 2; i < app->configuration.fixMinRaderPrimeMult; i++) { if (tempSequence % i == 0) { tempSequence /= i; multipliers[i]++; i--; } } // verify that we haven't checked for 3 steps being not enougth for Rader before uint64_t forceRaderTwoUpload = 0; // for sequences like 17*1023 it is better to switch to two uploads for better occupancy. We will switch if one of the Rader primes requests more than 512 threads. if (!app->useBluesteinFFT[axis_id]) { uint64_t useRaderMult = 0; uint64_t rader_primes[20]; uint64_t rader_multipliers[20]; for (uint64_t i = 0; i < 20; i++) { rader_multipliers[i] = 0; rader_primes[i] = 0; } uint64_t tempSequence_temp = 1; uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; uint64_t limit_max_rader_prime = ((axis_id == nonStridedAxisId) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] <= maxSequenceLengthSharedMemory)) ? 
maxSequenceLengthSharedMemory : maxSequenceLengthSharedMemoryStrided_temp; if (limit_max_rader_prime > app->configuration.fixMaxRaderPrimeFFT) limit_max_rader_prime = app->configuration.fixMaxRaderPrimeFFT; for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit_max_rader_prime; i++) { if (tempSequence % i == 0) { if (i < app->configuration.fixMinRaderPrimeFFT) { tempSequence_temp *= i; tempSequence /= i; i--; continue; } //Sophie Germain safe prime check uint64_t tempSequence2 = i - 1; for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { if (tempSequence2 % j == 0) { tempSequence2 /= j; j--; } } if (tempSequence2 != 1) { maxSequenceLengthSharedMemory = (usedSharedMemory - (i - 1) * complexSize) / complexSize; maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? (usedSharedMemory - (i - 1) * complexSize) / (app->configuration.coalescedMemory) : (usedSharedMemory - (i - 1) * complexSize) / complexSize; limit_max_rader_prime = ((axis_id == nonStridedAxisId) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] <= maxSequenceLengthSharedMemory)) ? 
maxSequenceLengthSharedMemory : maxSequenceLengthSharedMemoryStrided_temp; tempSequence_temp *= i; tempSequence /= i; i--; continue; } tempSequence /= i; if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / i > 512) forceRaderTwoUpload = 1; for (uint64_t j = 0; j < 20; j++) { if (rader_primes[j] == i) { rader_multipliers[j]++; j = 20; } else if (rader_primes[j] == 0) { rader_primes[j] = i; rader_multipliers[j]++; j = 20; } } i--; } } tempSequence *= tempSequence_temp; uint64_t maxRaderPrimeFromThreadNumCoalesced = (app->configuration.maxThreadsNum / (app->configuration.coalescedMemory / complexSize)) * 2 - 1; if (maxRaderPrimeFromThreadNumCoalesced < app->configuration.fixMaxRaderPrimeMult) app->configuration.fixMaxRaderPrimeMult = maxRaderPrimeFromThreadNumCoalesced; for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { if (tempSequence % i == 0) { tempSequence /= i; for (uint64_t j = 0; j < 20; j++) { if (rader_primes[j] == i) { rader_multipliers[j]++; j = 20; } else if (rader_primes[j] == 0) { rader_primes[j] = i; rader_multipliers[j]++; j = 20; } } useRaderMult = i; i--; } } if (tempSequence != 1) { useRaderMult = 0; forceRaderTwoUpload = 0; } if (useRaderMult) { if (tempSequence == 1) usedSharedMemory -= (useRaderMult - 1) * complexSize; //reserve memory for Rader } maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; //check once again for R2C if ((axis_id == 0) && (app->configuration.performR2C) && (tempSequence == 1) && ((app->configuration.size[axis_id] > maxSingleSizeNonStrided) || forceRaderTwoUpload)) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; FFTPlan->multiUploadR2C = 1; } } //initial Bluestein check if (tempSequence != 1) { app->useBluesteinFFT[axis_id] = 1; if (axis_id != 
nonStridedAxisId) { if (app->configuration.performBandwidthBoost == 0) axes->specializationConstants.performBandwidthBoost = 1; } app->configuration.registerBoost = 1; tempSequence = 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1; uint64_t FFTSizeSelected = 0; if (((app->configuration.useCustomBluesteinPaddingPattern > 0) || (app->configuration.autoCustomBluesteinPaddingPattern)) && (!app->configuration.fixMaxRadixBluestein)) { uint64_t arr_limit = (app->configuration.useCustomBluesteinPaddingPattern) ? app->configuration.useCustomBluesteinPaddingPattern : app->configuration.autoCustomBluesteinPaddingPattern; for (uint64_t i = 0; i < arr_limit; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.primeSizes[i]) { if (i != (arr_limit - 1)) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < app->configuration.primeSizes[i + 1]) { tempSequence = app->configuration.paddedSizes[i]; FFTSizeSelected = 1; i = arr_limit; } } else { if ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) <= app->configuration.paddedSizes[i]) { tempSequence = app->configuration.paddedSizes[i]; FFTSizeSelected = 1; i = arr_limit; } } } } } if (app->configuration.fixMaxRadixBluestein > 0) { while (!FFTSizeSelected) { uint64_t testSequence = tempSequence; for (uint64_t i = 0; i < 33; i++) { multipliers[i] = 0; } for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) { if (testSequence % i == 0) { testSequence /= i; multipliers[i]++; i--; } } if (testSequence == 1) FFTSizeSelected = 1; else tempSequence++; } } else { while (!FFTSizeSelected) { if (axis_id == nonStridedAxisId) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = 
(uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); } else { uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); } uint64_t testSequence = tempSequence; for (uint64_t i = 0; i < 33; i++) { multipliers[i] = 0; } for (uint64_t i = 2; i < 8; i++) { if (testSequence % i == 0) { testSequence /= i; multipliers[i]++; i--; } } if (testSequence != 1) tempSequence++; else { uint64_t registers_per_thread_per_radix[33]; uint64_t registers_per_thread = 0; uint64_t min_registers_per_thread = -1; uint64_t isGoodSequence = 0; res = VkFFTGetRegistersPerThread(app, tempSequence, 0, max_rhs / tempSequence, axes->specializationConstants.useRader, multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); if (res != VKFFT_SUCCESS) return res; if (isGoodSequence) FFTSizeSelected = 1; else tempSequence++; } } } FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence; //check if padded system still single upload for r2c - else redo the optimization if ((axis_id == 0) && (app->configuration.performR2C) && (!FFTPlan->multiUploadR2C) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] > maxSingleSizeNonStrided)) { FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; FFTPlan->multiUploadR2C = 1; tempSequence = 2 * 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1; FFTSizeSelected = 0; if (((app->configuration.useCustomBluesteinPaddingPattern > 0) || (app->configuration.autoCustomBluesteinPaddingPattern)) && (!app->configuration.fixMaxRadixBluestein)) { uint64_t arr_limit = (app->configuration.useCustomBluesteinPaddingPattern) ? app->configuration.useCustomBluesteinPaddingPattern : app->configuration.autoCustomBluesteinPaddingPattern; for (uint64_t i = 0; i < arr_limit; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.primeSizes[i]) { if (i != (arr_limit - 1)) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < app->configuration.primeSizes[i + 1]) { tempSequence = app->configuration.paddedSizes[i]; FFTSizeSelected = 1; } } else { if ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) <= app->configuration.paddedSizes[i]) { tempSequence = app->configuration.paddedSizes[i]; FFTSizeSelected = 1; } } } } } if (app->configuration.fixMaxRadixBluestein > 0) { while (!FFTSizeSelected) { uint64_t testSequence = tempSequence; for (uint64_t i = 0; i < 33; i++) { multipliers[i] = 0; } for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) { if (testSequence % i == 0) { testSequence /= i; multipliers[i]++; i--; } } if (testSequence == 1) FFTSizeSelected = 1; else tempSequence++; } } else { while (!FFTSizeSelected) { if (axis_id == nonStridedAxisId) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); } else { uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? 
usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); } uint64_t testSequence = tempSequence; for (uint64_t i = 0; i < 33; i++) { multipliers[i] = 0; } for (uint64_t i = 2; i < 8; i++) { if (testSequence % i == 0) { testSequence /= i; multipliers[i]++; i--; } } if (testSequence != 1) tempSequence++; else { uint64_t registers_per_thread_per_radix[33]; uint64_t registers_per_thread = 0; uint64_t min_registers_per_thread = -1; uint64_t isGoodSequence = 0; res = VkFFTGetRegistersPerThread(app, tempSequence, 0, max_rhs / tempSequence, axes->specializationConstants.useRader, multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); if (res != VKFFT_SUCCESS) return res; if (isGoodSequence) FFTSizeSelected = 1; else tempSequence++; } } } FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence; } if (app->configuration.forceBluesteinSequenceSize) FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.forceBluesteinSequenceSize; if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) == 0) { usedSharedMemory = app->configuration.sharedMemorySizePow2; maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; } } uint64_t isPowOf2 = (pow(2, (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])) == FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) ? 
1 : 0; uint64_t locNumBatches = (app->configuration.numberBatches > app->actualNumBatches) ? app->configuration.numberBatches : app->actualNumBatches; //return VKFFT_ERROR_UNSUPPORTED_RADIX; uint64_t registerBoost = 1; for (uint64_t i = 1; i <= app->configuration.registerBoost; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) registerBoost = i; } if ((axis_id == nonStridedAxisId) && (!app->configuration.performConvolution)) maxSingleSizeNonStrided *= registerBoost; uint64_t maxSequenceLengthSharedMemoryStrided = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; uint64_t maxSingleSizeStrided = (!app->configuration.performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided; uint64_t numPasses = 1; uint64_t numPassesHalfBandwidth = 1; uint64_t temp; temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStrided); if (temp > 1) {//more passes than one for (uint64_t i = 1; i <= app->configuration.registerBoost4Step; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { registerBoost = i; } } if ((!app->configuration.performConvolution)) maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost; if ((!app->configuration.performConvolution)) maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost; temp = ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) ? 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided; if (app->configuration.reorderFourStep && (!app->useBluesteinFFT[axis_id])) numPasses = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided)); else numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided)); } registerBoost = ((axis_id == nonStridedAxisId) && ((app->useBluesteinFFT[axis_id]) || (!app->configuration.reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)pow(maxSequenceLengthSharedMemoryStrided, numPasses)); uint64_t canBoost = 0; for (uint64_t i = registerBoost; i <= app->configuration.registerBoost; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { registerBoost = i; i = app->configuration.registerBoost + 1; canBoost = 1; } } if (((canBoost == 0) || (((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) != 0) && (!app->configuration.registerBoostNonPow2))) && (registerBoost > 1)) { registerBoost = 1; numPasses++; } maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost; maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost; uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; if ((axes->specializationConstants.performBandwidthBoost)) { maxSingleSizeStridedHalfBandwidth = (app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost) : usedSharedMemory / complexSize; temp = (axis_id == nonStridedAxisId) ? 
(uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); //temp = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided; if (temp > 1) {//more passes than two temp = ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id])) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); for (uint64_t i = 0; i < 5; i++) { temp = (uint64_t)ceil(temp / (double)maxSingleSizeStrided); numPassesHalfBandwidth++; if (temp == 1) i = 5; } /* temp = ((axis_id == 0) && (!app->configuration.reorderFourStep)) ? FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStridedHalfBandwidth; if (app->configuration.reorderFourStep) numPassesHalfBandwidth = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStridedHalfBandwidth)); else numPassesHalfBandwidth = 1 + (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStridedHalfBandwidth)); if ((numPassesHalfBandwidth == 2)&& (!app->configuration.reorderFourStep)&&(registerBoost>1)) //switch back for two step and don't do half bandwidth on strided accesses if register boost and no 4-step reordering */ } if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth; else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; } if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 131072)) numPasses = 3;//Force set to 3 stage 4 step algorithm if (forceRaderTwoUpload && (numPasses == 1)) numPasses = 2;//Force set Rader cases that use more than 512 threads per one of Rader primes uint64_t* 
locAxisSplit = FFTPlan->axisSplit[axis_id]; if (numPasses == 1) { locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } if (numPasses == 2) { if (isPowOf2 && (!((app->configuration.vendorID == 0x10DE) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] > 262144)))) { if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3); //unit stride if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) { locAxisSplit[0] = maxPow8SharedMemory; } else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) { locAxisSplit[0] = maxSequenceLengthSharedMemory; } else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) { for (uint64_t i = 1; i <= (uint64_t)log2(registerBoost); i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided) { locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)); i = (uint64_t)log2(registerBoost) + 1; } } } else { locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost); } } } } else { uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3); if (maxPow8Strided > 512) maxPow8Strided = 512; //all FFTs are considered as non-unit stride if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) { locAxisSplit[0] = maxPow8Strided; } else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) { locAxisSplit[0] = maxSingleSizeStrided; } else { locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth; } } } locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / 
locAxisSplit[0]; if (locAxisSplit[1] < 64) { locAxisSplit[0] = (locAxisSplit[1] == 0) ? locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]); locAxisSplit[1] = 64; } if (locAxisSplit[1] > locAxisSplit[0]) { uint64_t swap = locAxisSplit[0]; locAxisSplit[0] = locAxisSplit[1]; locAxisSplit[1] = swap; } } else { uint64_t successSplit = 0; if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { /*for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) { if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) <= maxSingleSizeStrided)) { locAxisSplit[0] = (maxSequenceLengthSharedMemory - i); locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i); i = maxSequenceLengthSharedMemory; successSplit = 1; } } }*/ uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); for (uint64_t i = 0; i < sqrtSequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) { locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i); locAxisSplit[1] = sqrtSequence - i; i = sqrtSequence; successSplit = 1; } } } } else { uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); for (uint64_t i = 0; i < sqrtSequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) { locAxisSplit[0] = 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i); locAxisSplit[1] = sqrtSequence - i; i = sqrtSequence; successSplit = 1; } } } } if (successSplit == 0) numPasses = 3; } } if (numPasses == 3) { if (isPowOf2 && (!((app->configuration.vendorID == 0x10DE) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] > 262144)))) { uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3); if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { //unit stride uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3); if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided) locAxisSplit[0] = maxPow8SharedMemory; else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided) locAxisSplit[0] = maxSequenceLengthSharedMemory; else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) { for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) { locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)); i = (uint64_t)log2(registerBoost) + 1; } } } else { locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost); } } } } else { //to account for TLB misses, it is best to coalesce the unit-strided stage to 128 bytes /*uint64_t log2axis = (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]); locAxisSplit[0] = (uint64_t)pow(2, (uint64_t)log2axis / 3); if (log2axis % 3 > 0) locAxisSplit[0] *= 2; locAxisSplit[1] = (uint64_t)pow(2, (uint64_t)log2axis / 3); if (log2axis % 3 > 1) locAxisSplit[1] *= 2; locAxisSplit[2] = 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / locAxisSplit[1];*/ uint64_t maxSingleSizeStrided128 = usedSharedMemory / (128); uint64_t maxPow8_128 = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3); //unit stride if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided) locAxisSplit[0] = maxPow8_128; //non-unit stride else { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) { locAxisSplit[0] = maxPow8_128 * 2; } else { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) { locAxisSplit[0] = maxPow8_128 * 4; } else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) { for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) { locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)pow(2, i)); i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1; } } } else locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth; } } } } if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] < maxPow8Strided) { locAxisSplit[1] = (uint64_t)pow(2, (uint64_t)(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0]) / 2)); locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / locAxisSplit[1]; } else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) { locAxisSplit[1] = maxPow8Strided; locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0]; } 
else { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) { locAxisSplit[1] = maxSingleSizeStrided; locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0]; } else { locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth; locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0]; } } if (locAxisSplit[2] < 64) { locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]); locAxisSplit[2] = 64; } } if (locAxisSplit[2] > locAxisSplit[1]) { uint64_t swap = locAxisSplit[1]; locAxisSplit[1] = locAxisSplit[2]; locAxisSplit[2] = swap; } } else { uint64_t successSplit = 0; if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) { uint64_t sqrt3Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i))); for (uint64_t j = 0; j < sqrt3Sequence; j++) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) { if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) { locAxisSplit[0] = (maxSequenceLengthSharedMemory - i); locAxisSplit[1] = sqrt3Sequence - j; locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j); i = maxSequenceLengthSharedMemory; j = sqrt3Sequence; successSplit = 1; } } } } } } else { uint64_t sqrt3Sequence = 
(uint64_t)ceil(pow(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0)); for (uint64_t i = 0; i < sqrt3Sequence; i++) { if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrt3Sequence - i) == 0) { uint64_t sqrt2Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i))); for (uint64_t j = 0; j < sqrt2Sequence; j++) { if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) { if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) { locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j); locAxisSplit[1] = sqrt3Sequence - i; locAxisSplit[2] = sqrt2Sequence - j; i = sqrt3Sequence; j = sqrt2Sequence; successSplit = 1; } } } } } } if (successSplit == 0) numPasses = 4; } } if (numPasses > 3) { //printf("sequence length exceeds boundaries\n"); return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH; } if ((numPasses > 1) && (app->configuration.performDCT > 0)) { //printf("sequence length exceeds boundaries\n"); return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT; } if ((numPasses > 1) && (app->configuration.performR2C > 0) && (axis_id == 0) && (app->configuration.size[axis_id] % 2 != 0)) { //printf("sequence length exceeds boundaries\n"); return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C; } if (app->configuration.userTempBuffer == 0) { uint64_t tempBufferSize = 1; if ((app->configuration.performR2C) && (axis_id == 0)) { if (FFTPlan->multiUploadR2C) { tempBufferSize = 1; tempBufferSize *= (app->configuration.bufferStride[0] > (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1)) ? 
app->configuration.bufferStride[0] : (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1); tempBufferSize *= ((app->configuration.bufferStride[1] / app->configuration.bufferStride[0]) > FFTPlan->actualFFTSizePerAxis[axis_id][1]) ? (app->configuration.bufferStride[1] / app->configuration.bufferStride[0]) : FFTPlan->actualFFTSizePerAxis[axis_id][1]; tempBufferSize *= ((app->configuration.bufferStride[2] / app->configuration.bufferStride[1]) > FFTPlan->actualFFTSizePerAxis[axis_id][2]) ? (app->configuration.bufferStride[2] / app->configuration.bufferStride[1]) : FFTPlan->actualFFTSizePerAxis[axis_id][2]; tempBufferSize *= app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; //app->configuration.tempBufferSize[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; } } else { tempBufferSize = 1; tempBufferSize *= (app->configuration.bufferStride[0] > FFTPlan->actualFFTSizePerAxis[axis_id][0]) ? app->configuration.bufferStride[0] : FFTPlan->actualFFTSizePerAxis[axis_id][0]; tempBufferSize *= ((app->configuration.bufferStride[1] / app->configuration.bufferStride[0]) > FFTPlan->actualFFTSizePerAxis[axis_id][1]) ? (app->configuration.bufferStride[1] / app->configuration.bufferStride[0]) : FFTPlan->actualFFTSizePerAxis[axis_id][1]; tempBufferSize *= ((app->configuration.bufferStride[2] / app->configuration.bufferStride[1]) > FFTPlan->actualFFTSizePerAxis[axis_id][2]) ? 
(app->configuration.bufferStride[2] / app->configuration.bufferStride[1]) : FFTPlan->actualFFTSizePerAxis[axis_id][2]; tempBufferSize *= app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; //FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures* locNumBatches* app->configuration.numberKernels* complexSize; } if (tempBufferSize > app->configuration.tempBufferSize[0]) app->configuration.tempBufferSize[0] = tempBufferSize; } if (((app->configuration.reorderFourStep) && (!app->useBluesteinFFT[axis_id]))) { for (uint64_t i = 0; i < numPasses; i++) { if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) { uint64_t swap = locAxisSplit[0]; locAxisSplit[0] = locAxisSplit[i]; locAxisSplit[i] = swap; } } for (uint64_t i = 0; i < numPasses; i++) { if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) { uint64_t swap = locAxisSplit[0]; locAxisSplit[0] = locAxisSplit[i]; locAxisSplit[i] = swap; } } for (uint64_t i = 0; i < numPasses; i++) { if ((locAxisSplit[0] % 8 != 0) && (locAxisSplit[i] % 8 == 0)) { uint64_t swap = locAxisSplit[0]; locAxisSplit[0] = locAxisSplit[i]; locAxisSplit[i] = swap; } } } FFTPlan->numAxisUploads[axis_id] = numPasses; for (uint64_t k = 0; k < numPasses; k++) { tempSequence = locAxisSplit[k]; uint64_t loc_multipliers[33]; //split the smaller sequence //split the smaller sequence //uint64_t rader_multipliers[20]; //split the smaller sequence //uint64_t* rader_generator = axes[k].specializationConstants.rader_generator_sorted; //split the smaller sequence //uint64_t* rader_primes = axes[k].specializationConstants.rader_primes; for (uint64_t i = 0; i < 33; i++) { loc_multipliers[i] = 0; } for (uint64_t i = 2; i < app->configuration.fixMinRaderPrimeMult; i++) { if (tempSequence % i == 0) { tempSequence /= i; loc_multipliers[i]++; i--; } } 
axes[k].specializationConstants.useRader = 0; axes[k].specializationConstants.useRaderMult = 0; axes[k].specializationConstants.useRaderFFT = 0; if (tempSequence != 1) { res = VkFFTConstructRaderTree(app, &axes[k].specializationConstants.raderContainer, &tempSequence, &axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k] / tempSequence); if (res != VKFFT_SUCCESS) return res; } for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { if (axes[k].specializationConstants.useRaderFFT < axes[k].specializationConstants.raderContainer[i].prime) axes[k].specializationConstants.useRaderFFT = axes[k].specializationConstants.raderContainer[i].prime; } else { if (axes[k].specializationConstants.useRaderMult < axes[k].specializationConstants.raderContainer[i].prime) axes[k].specializationConstants.useRaderMult = axes[k].specializationConstants.raderContainer[i].prime; } } if (axes[k].specializationConstants.useRaderMult) { app->configuration.useLUT = 1; // workaround, Mult Rader is better with LUT } axes[k].specializationConstants.useRader = axes[k].specializationConstants.numRaderPrimes; if ((axes[k].specializationConstants.useRader) && (app->configuration.useRaderUintLUT)) { app->configuration.useLUT = 1; // useRaderUintLUT forces LUT } uint64_t registers_per_thread_per_radix[33]; uint64_t registers_per_thread = 0; uint64_t min_registers_per_thread = -1; uint64_t isGoodSequence = 0; uint64_t extraSharedMemoryForPow2 = ((app->configuration.sharedMemorySizePow2 < app->configuration.sharedMemorySize) || ((locAxisSplit[k] < maxSingleSizeNonStrided) && ((axis_id == nonStridedAxisId))) || ((locAxisSplit[k] < maxSingleSizeStrided) && ((axis_id != nonStridedAxisId)))) ? 
1 : 0; res = VkFFTGetRegistersPerThread(app, locAxisSplit[k], extraSharedMemoryForPow2, max_rhs / locAxisSplit[k], axes[k].specializationConstants.numRaderPrimes, loc_multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); if (res != VKFFT_SUCCESS) return res; //first optimizer pass if (axes[k].specializationConstants.numRaderPrimes) { res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k], &min_registers_per_thread, ®isters_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; /*for (int64_t i = 0; i < axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { if (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / min_registers_per_thread >= 2) { min_registers_per_thread *= (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / min_registers_per_thread); for (uint64_t j = 0; j < 33; j++) { if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread)) registers_per_thread_per_radix[j] *= (uint64_t)ceil(min_registers_per_thread / (double)registers_per_thread_per_radix[j]); } for (uint64_t j = 0; j < 33; j++) { if (registers_per_thread_per_radix[j] > registers_per_thread) registers_per_thread = registers_per_thread_per_radix[j]; } } else if (min_registers_per_thread / axes[k].specializationConstants.raderContainer[i].min_registers_per_thread >= 2) { axes[k].specializationConstants.raderContainer[i].min_registers_per_thread *= (min_registers_per_thread / axes[k].specializationConstants.raderContainer[i].min_registers_per_thread); for (uint64_t j = 0; j < 33; j++) { if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && 
(axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < axes[k].specializationConstants.raderContainer[i].min_registers_per_thread)) axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= (uint64_t)ceil(axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / (double)axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]); } for (uint64_t j = 0; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > axes[k].specializationConstants.raderContainer[i].registers_per_thread) axes[k].specializationConstants.raderContainer[i].registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } } if (axes[k].specializationConstants.raderContainer[i].registers_per_thread > registers_per_thread) registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread; if (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread < min_registers_per_thread) min_registers_per_thread = axes[k].specializationConstants.raderContainer[i].min_registers_per_thread; } }*/ } if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) { registers_per_thread *= 2; for (uint64_t i = 2; i < 33; i++) { registers_per_thread_per_radix[i] *= 2; } min_registers_per_thread *= 2; } uint64_t maxBatchCoalesced = ((axis_id == 0) && (((k == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (numPasses == 1))) ? 
1 : app->configuration.coalescedMemory / complexSize; uint64_t estimate_rader_threadnum = 0; uint64_t scale_registers_rader = 0; uint64_t rader_min_registers = min_registers_per_thread; if (axes[k].specializationConstants.useRaderMult) { for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((locAxisSplit[k] / (double)((rader_min_registers / 2 + scale_registers_rader) * 2)) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = (uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2) * maxBatchCoalesced; if ((maxBatchCoalesced * locAxisSplit[k] / ((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)) > local_estimate_rader_threadnum) local_estimate_rader_threadnum = (maxBatchCoalesced * locAxisSplit[k] / ((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)); if ((local_estimate_rader_threadnum > app->configuration.maxThreadsNum) || ((((locAxisSplit[k] / min_registers_per_thread) > 256) || (local_estimate_rader_threadnum > 256)) && (((rader_min_registers / 2 + scale_registers_rader) * 2) <= 4))) { scale_registers_rader++; i = -1; } else { 
estimate_rader_threadnum = (estimate_rader_threadnum < local_estimate_rader_threadnum) ? local_estimate_rader_threadnum : estimate_rader_threadnum; } } } rader_min_registers = (rader_min_registers / 2 + scale_registers_rader) * 2;//min number of registers for Rader (can be more than min_registers_per_thread, but min_registers_per_thread should be at least 4 for Nvidiaif you have >256 threads) if (registers_per_thread < rader_min_registers) registers_per_thread = rader_min_registers; for (uint64_t i = 2; i < 33; i++) { if (registers_per_thread_per_radix[i] != 0) { if (registers_per_thread / registers_per_thread_per_radix[i] >= 2) { registers_per_thread_per_radix[i] *= (registers_per_thread / registers_per_thread_per_radix[i]); } } } for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 2; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { if (registers_per_thread / axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] >= 2) { axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= (registers_per_thread / axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]); } } } } } uint64_t new_min_registers = -1; for (uint64_t i = 2; i < 33; i++) { if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < new_min_registers)) new_min_registers = registers_per_thread_per_radix[i]; if (registers_per_thread_per_radix[i] > registers_per_thread) { registers_per_thread = registers_per_thread_per_radix[i]; } } for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 2; j < 33; j++) { if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && 
(axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < new_min_registers)) new_min_registers = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > registers_per_thread) { registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } } } } min_registers_per_thread = (new_min_registers == -1) ? registers_per_thread : new_min_registers; } if ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost) > app->configuration.maxThreadsNum) || (axes[k].specializationConstants.useRader && (estimate_rader_threadnum > app->configuration.maxThreadsNum))) { uint64_t scaleRegistersNum = 1; if ((axis_id == 0) && (k == 0) && (maxBatchCoalesced > 1)) { maxBatchCoalesced = app->configuration.maxThreadsNum * (min_registers_per_thread * registerBoost) / locAxisSplit[k]; if (maxBatchCoalesced < 1) maxBatchCoalesced = 1; } if ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->configuration.maxThreadsNum) { for (uint64_t i = 2; i < locAxisSplit[k]; i++) { if (((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * i)) <= app->configuration.maxThreadsNum)) { scaleRegistersNum = i; i = locAxisSplit[k]; } } } min_registers_per_thread *= scaleRegistersNum; registers_per_thread *= scaleRegistersNum; for (uint64_t i = 2; i < 33; i++) { if (registers_per_thread_per_radix[i] != 0) { registers_per_thread_per_radix[i] *= scaleRegistersNum; } } uint64_t new_min_registers = -1; for (uint64_t i = 2; i < 33; i++) { if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < new_min_registers)) new_min_registers = registers_per_thread_per_radix[i]; } for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if 
(axes[k].specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 2; j < 33; j++) { if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < new_min_registers)) new_min_registers = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } } } if ((maxBatchCoalesced * locAxisSplit[k] / (new_min_registers * registerBoost)) > app->configuration.maxThreadsNum) { // if we get here, there can be trouble with small primes, as we can have one thread do at max one fftDim. This is only an issue for small primes in sequences close to shared memory limit sizes for extremely big shared memory sizes (>136KB) for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 2; j < 33; j++) { if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= scaleRegistersNum; } } } } } else { min_registers_per_thread = new_min_registers; } if (min_registers_per_thread > registers_per_thread) { temp = min_registers_per_thread; min_registers_per_thread = registers_per_thread; registers_per_thread = temp; } for (uint64_t i = 2; i < 33; i++) { if (registers_per_thread_per_radix[i] > registers_per_thread) { registers_per_thread = registers_per_thread_per_radix[i]; } if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) { min_registers_per_thread = registers_per_thread_per_radix[i]; } } for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 2; j < 33; j++) { if 
(axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > registers_per_thread) { registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < min_registers_per_thread)) { min_registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; } } } } if ((loc_multipliers[3] >= 2) && (((registers_per_thread / min_registers_per_thread) % 3) == 0) && (axes[k].specializationConstants.numRaderPrimes == 0)) { registers_per_thread /= 3; for (uint64_t i = 2; i < 33; i++) { if (registers_per_thread_per_radix[i] % 9 == 0) { registers_per_thread_per_radix[i] /= 3; } } for (uint64_t i = 2; i < 33; i++) { if (registers_per_thread_per_radix[i] > registers_per_thread) { registers_per_thread = registers_per_thread_per_radix[i]; } if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) { min_registers_per_thread = registers_per_thread_per_radix[i]; } } } } //second optimizer pass if (axes[k].specializationConstants.numRaderPrimes) { res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k], &min_registers_per_thread, ®isters_per_thread, registers_per_thread_per_radix); if (res != VKFFT_SUCCESS) return res; } axes[k].specializationConstants.maxNonPow2Radix = 1; axes[k].specializationConstants.usedLocRegs = 1; res = VkFFTOptimizeRadixKernels(registers_per_thread_per_radix, loc_multipliers, registerBoost, &axes[k].specializationConstants.maxNonPow2Radix, &axes[k].specializationConstants.usedLocRegs, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes); if (res != VKFFT_SUCCESS) return res; for 
(uint64_t i = 2; i < 33; i++) { axes[k].specializationConstants.registers_per_thread_per_radix[i] = registers_per_thread_per_radix[i]; } axes[k].specializationConstants.numStages = 0; axes[k].specializationConstants.fftDim = locAxisSplit[k]; uint64_t tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep)||(app->useBluesteinFFT[axis_id]))) ? (uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeStrided); uint64_t switchRegisterBoost = 0; if (tempRegisterBoost > 1) { if (loc_multipliers[tempRegisterBoost] > 0) { loc_multipliers[tempRegisterBoost]--; switchRegisterBoost = tempRegisterBoost; } else { for (uint64_t i = 32; i > 1; i--) { if (loc_multipliers[i] > 0) { loc_multipliers[i]--; switchRegisterBoost = i; i = 1; } } } } res = VkFFTGetRaderFFTStages(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, &axes[k].specializationConstants.numStages, axes[k].specializationConstants.stageRadix, axes[k].specializationConstants.rader_generator); if (res != VKFFT_SUCCESS) return res; for (uint64_t i = 32; i > 1; i--) { if (loc_multipliers[i] > 0) { axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = i; loc_multipliers[i]--; i++; axes[k].specializationConstants.numStages++; } } //add more registers for Rader FFT if needed if (axes[k].specializationConstants.useRaderMult) { axes[k].specializationConstants.rader_min_registers = rader_min_registers; for (uint64_t i = 0; i < axes[k].specializationConstants.numRaderPrimes; i++) { if (axes[k].specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((locAxisSplit[k] / (double)axes[k].specializationConstants.rader_min_registers) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = 
(uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } axes[k].specializationConstants.raderRegisters = (active_rader * 2 > axes[k].specializationConstants.raderRegisters) ? active_rader * 2 : axes[k].specializationConstants.raderRegisters; if (active_rader * 2 > registers_per_thread) registers_per_thread = active_rader * 2; } } if (axes[k].specializationConstants.raderRegisters < axes[k].specializationConstants.rader_min_registers) axes[k].specializationConstants.raderRegisters = axes[k].specializationConstants.rader_min_registers; } //final check up on all registers, increase if bigger registers_per_thread = 0; min_registers_per_thread = -1; if (axes[k].specializationConstants.useRaderMult) { registers_per_thread = axes[k].specializationConstants.raderRegisters; min_registers_per_thread = axes[k].specializationConstants.rader_min_registers; } res = VkFFTMinMaxRegisterCheck(axes[k].specializationConstants.numStages, axes[k].specializationConstants.stageRadix, &min_registers_per_thread, ®isters_per_thread, axes[k].specializationConstants.registers_per_thread_per_radix, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, axes[k].specializationConstants.rader_generator);; if (res != VKFFT_SUCCESS) return res; axes[k].specializationConstants.minRaderFFTThreadNum = 0; res = VkFFTGetRaderFFTThreadsNum(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, 
&axes[k].specializationConstants.minRaderFFTThreadNum); if (res != VKFFT_SUCCESS) return res; axes[k].specializationConstants.registerBoost = registerBoost; axes[k].specializationConstants.registers_per_thread = registers_per_thread; axes[k].specializationConstants.min_registers_per_thread = min_registers_per_thread; if (switchRegisterBoost > 0) { axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = switchRegisterBoost; axes[k].specializationConstants.numStages++; } else { //try to read directly to registers if (min_registers_per_thread != registers_per_thread) { for (uint64_t i = 0; i < axes[k].specializationConstants.numStages; i++) { if (axes[k].specializationConstants.registers_per_thread_per_radix[axes[k].specializationConstants.stageRadix[i]] == min_registers_per_thread) { uint64_t stageid = axes[k].specializationConstants.stageRadix[i]; axes[k].specializationConstants.stageRadix[i] = axes[k].specializationConstants.stageRadix[0]; axes[k].specializationConstants.stageRadix[0] = stageid; if (axes[k].specializationConstants.useRader) { stageid = axes[k].specializationConstants.rader_generator[i]; axes[k].specializationConstants.rader_generator[i] = axes[k].specializationConstants.rader_generator[0]; axes[k].specializationConstants.rader_generator[0] = stageid; } i = axes[k].specializationConstants.numStages; } } } } } return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id) { //generate two arrays used for Blueestein convolution and post-convolution multiplication VkFFTResult resFFT = VKFFT_SUCCESS; uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) bufferSize *= sizeof(double) / sizeof(float); app->bufferBluesteinSize[axis_id] = bufferSize; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; resFFT = 
allocateFFTBuffer(app, &app->bufferBluestein[axis_id], &app->bufferBluesteinDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; if (!app->configuration.makeInversePlanOnly) { resFFT = allocateFFTBuffer(app, &app->bufferBluesteinFFT[axis_id], &app->bufferBluesteinFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (!app->configuration.makeForwardPlanOnly) { resFFT = allocateFFTBuffer(app, &app->bufferBluesteinIFFT[axis_id], &app->bufferBluesteinIFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; } #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; res = cudaMalloc((void**)&app->bufferBluestein[axis_id], bufferSize); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; if (!app->configuration.makeInversePlanOnly) { res = cudaMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } if (!app->configuration.makeForwardPlanOnly) { res = cudaMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; res = hipMalloc((void**)&app->bufferBluestein[axis_id], bufferSize); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; if (!app->configuration.makeInversePlanOnly) { res = hipMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } if (!app->configuration.makeForwardPlanOnly) { res = 
hipMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; app->bufferBluestein[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; if (!app->configuration.makeInversePlanOnly) { app->bufferBluesteinFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } if (!app->configuration.makeForwardPlanOnly) { app->bufferBluesteinIFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluestein[axis_id]); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; if (!app->configuration.makeInversePlanOnly) { res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluesteinFFT[axis_id]); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } if (!app->configuration.makeForwardPlanOnly) { res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluesteinIFFT[axis_id]); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; } 
#elif(VKFFT_BACKEND==5) app->bufferBluestein[axis_id] = app->configuration.device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); if (!app->configuration.makeInversePlanOnly) { app->bufferBluesteinFFT[axis_id] = app->configuration.device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); } if (!app->configuration.makeForwardPlanOnly) { app->bufferBluesteinIFFT[axis_id] = app->configuration.device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); } #endif #ifdef VkFFT_use_FP128_Bluestein_RaderFFT if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { double* phaseVectors_fp64 = (double*)malloc(bufferSize); if (!phaseVectors_fp64) { return VKFFT_ERROR_MALLOC_FAILED; } long double* phaseVectors_fp128 = (long double*)malloc(2 * bufferSize); if (!phaseVectors_fp128) { free(phaseVectors_fp64); return VKFFT_ERROR_MALLOC_FAILED; } long double* phaseVectors_fp128_out = (long double*)malloc(2 * bufferSize); if (!phaseVectors_fp128) { free(phaseVectors_fp64); free(phaseVectors_fp128); return VKFFT_ERROR_MALLOC_FAILED; } uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2; long double double_PI = 3.14159265358979323846264338327950288419716939937510L; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); long double angle = double_PI * rm / phaseVectorsNonZeroSize; phaseVectors_fp128[2 * i] = (i < phaseVectorsNonZeroSize) ? cos(angle) : 0; phaseVectors_fp128[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
-sin(angle) : 0; } for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_fp128[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_fp128[2 * i]; phaseVectors_fp128[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_fp128[2 * i + 1]; } if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { fftwl_plan p; p = fftwl_plan_dft_1d((int)(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]), (fftwl_complex*)phaseVectors_fp128, (fftwl_complex*)phaseVectors_fp128_out, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t out = 0; if (FFTPlan->numAxisUploads[axis_id] == 1) { out = i; } else if (FFTPlan->numAxisUploads[axis_id] == 2) { out = i / FFTPlan->axisSplit[axis_id][1] + (i % FFTPlan->axisSplit[axis_id][1]) * FFTPlan->axisSplit[axis_id][0]; } else { out = (i / FFTPlan->axisSplit[axis_id][2]) / FFTPlan->axisSplit[axis_id][1] + ((i / FFTPlan->axisSplit[axis_id][2]) % FFTPlan->axisSplit[axis_id][1]) * FFTPlan->axisSplit[axis_id][0] + (i % FFTPlan->axisSplit[axis_id][2]) * FFTPlan->axisSplit[axis_id][1] * FFTPlan->axisSplit[axis_id][0]; } phaseVectors_fp64[2 * out] = (double)phaseVectors_fp128_out[2 * i]; phaseVectors_fp64[2 * out + 1] = (double)phaseVectors_fp128_out[2 * i + 1]; } resFFT = VkFFT_transferDataFromCPU(app, phaseVectors_fp64, &app->bufferBluesteinIFFT[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors_fp64); free(phaseVectors_fp128); free(phaseVectors_fp128_out); return resFFT; } } for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp128[2 * i + 1] = -phaseVectors_fp128[2 * i + 1]; } for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp64[2 * i] = (double)phaseVectors_fp128[2 * i]; phaseVectors_fp64[2 * i + 1] = (double)phaseVectors_fp128[2 * i + 1]; } resFFT = 
VkFFT_transferDataFromCPU(app, phaseVectors_fp64, &app->bufferBluestein[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors_fp64); free(phaseVectors_fp128); free(phaseVectors_fp128_out); return resFFT; } if (!app->configuration.makeInversePlanOnly) { fftwl_plan p; p = fftwl_plan_dft_1d((int)(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]), (fftwl_complex*)phaseVectors_fp128, (fftwl_complex*)phaseVectors_fp128_out, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t out = 0; if (FFTPlan->numAxisUploads[axis_id] == 1) { out = i; } else if (FFTPlan->numAxisUploads[axis_id] == 2) { out = i / FFTPlan->axisSplit[axis_id][1] + (i % FFTPlan->axisSplit[axis_id][1]) * FFTPlan->axisSplit[axis_id][0]; } else { out = (i / FFTPlan->axisSplit[axis_id][2]) / FFTPlan->axisSplit[axis_id][1] + ((i / FFTPlan->axisSplit[axis_id][2]) % FFTPlan->axisSplit[axis_id][1]) * FFTPlan->axisSplit[axis_id][0] + (i % FFTPlan->axisSplit[axis_id][2]) * FFTPlan->axisSplit[axis_id][1] * FFTPlan->axisSplit[axis_id][0]; } phaseVectors_fp64[2 * out] = (double)phaseVectors_fp128_out[2 * i]; phaseVectors_fp64[2 * out + 1] = (double)phaseVectors_fp128_out[2 * i + 1]; } resFFT = VkFFT_transferDataFromCPU(app, phaseVectors_fp64, &app->bufferBluesteinFFT[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors_fp64); free(phaseVectors_fp128); free(phaseVectors_fp128_out); return resFFT; } } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { fftwl_plan p; p = fftwl_plan_dft_1d((int)(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]), (fftwl_complex*)phaseVectors_fp128, (fftwl_complex*)phaseVectors_fp128_out, 1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_fp64[2 * i] = (double)phaseVectors_fp128_out[2 * i]; phaseVectors_fp64[2 * i + 1] 
= (double)phaseVectors_fp128_out[2 * i + 1]; } resFFT = VkFFT_transferDataFromCPU(app, phaseVectors_fp64, &app->bufferBluesteinIFFT[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors_fp64); free(phaseVectors_fp128); free(phaseVectors_fp128_out); return resFFT; } } free(phaseVectors_fp64); free(phaseVectors_fp128); free(phaseVectors_fp128_out); } else { #endif VkFFTApplication kernelPreparationApplication = {}; VkFFTConfiguration kernelPreparationConfiguration = {}; kernelPreparationConfiguration.FFTdim = 1; kernelPreparationConfiguration.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; kernelPreparationConfiguration.size[1] = 1; kernelPreparationConfiguration.size[2] = 1; kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); kernelPreparationConfiguration.useLUT = 1; kernelPreparationConfiguration.registerBoost = 1; kernelPreparationConfiguration.disableReorderFourStep = 1; kernelPreparationConfiguration.fixMinRaderPrimeFFT = 17; kernelPreparationConfiguration.fixMinRaderPrimeMult = 17; kernelPreparationConfiguration.fixMaxRaderPrimeFFT = 17; kernelPreparationConfiguration.fixMaxRaderPrimeMult = 17; kernelPreparationConfiguration.saveApplicationToString = app->configuration.saveApplicationToString; kernelPreparationConfiguration.loadApplicationFromString = app->configuration.loadApplicationFromString; if (kernelPreparationConfiguration.loadApplicationFromString) { kernelPreparationConfiguration.loadApplicationString = (void*)((char*)app->configuration.loadApplicationString + app->currentApplicationStringPos); } kernelPreparationConfiguration.performBandwidthBoost = (app->configuration.performBandwidthBoost > 0) ? 
app->configuration.performBandwidthBoost : 1; if (axis_id == 0) kernelPreparationConfiguration.performBandwidthBoost = 0; if (axis_id > 0) kernelPreparationConfiguration.considerAllAxesStrided = 1; if (app->configuration.tempBuffer) { kernelPreparationConfiguration.userTempBuffer = 1; kernelPreparationConfiguration.tempBuffer = app->configuration.tempBuffer; kernelPreparationConfiguration.tempBufferSize = app->configuration.tempBufferSize; kernelPreparationConfiguration.tempBufferNum = app->configuration.tempBufferNum; } kernelPreparationConfiguration.device = app->configuration.device; #if(VKFFT_BACKEND==0) kernelPreparationConfiguration.queue = app->configuration.queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers kernelPreparationConfiguration.fence = app->configuration.fence; kernelPreparationConfiguration.commandPool = app->configuration.commandPool; kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can be initialized before VkFFT plan creation. 
if not, VkFFT will create and destroy one after initialization kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; #elif(VKFFT_BACKEND==3) kernelPreparationConfiguration.context = app->configuration.context; #elif(VKFFT_BACKEND==4) kernelPreparationConfiguration.context = app->configuration.context; kernelPreparationConfiguration.commandQueue = app->configuration.commandQueue; kernelPreparationConfiguration.commandQueueID = app->configuration.commandQueueID; #elif(VKFFT_BACKEND==5) kernelPreparationConfiguration.device = app->configuration.device; kernelPreparationConfiguration.queue = app->configuration.queue; #endif kernelPreparationConfiguration.inputBufferSize = &app->bufferBluesteinSize[axis_id]; kernelPreparationConfiguration.bufferSize = &app->bufferBluesteinSize[axis_id]; kernelPreparationConfiguration.isInputFormatted = 1; resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration); if (resFFT != VKFFT_SUCCESS) return resFFT; if (kernelPreparationConfiguration.loadApplicationFromString) { app->currentApplicationStringPos += kernelPreparationApplication.currentApplicationStringPos; } void* phaseVectors = malloc(bufferSize); if (!phaseVectors) { deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_MALLOC_FAILED; } uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? 
app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2; if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; double* phaseVectors_cast = (double*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); long double angle = double_PI * rm / phaseVectorsNonZeroSize; phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)-sin(angle) : 0; } for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } } else { double double_PI = 3.14159265358979323846264338327950288419716939937510; float* phaseVectors_cast = (float*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); double angle = double_PI * rm / phaseVectorsNonZeroSize; phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)-sin(angle) : 0; } for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } } resFFT = VkFFT_transferDataFromCPU(app, phaseVectors, &app->bufferBluestein[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } #if(VKFFT_BACKEND==0) { VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = {}; res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; } VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; } VkFFTLaunchParams launchParams = {}; launchParams.commandBuffer = &commandBuffer; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; //Record commands resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = vkEndCommandBuffer(commandBuffer); if (res != 0) 
{ free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; } res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_RESET_FENCES; } vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); } #elif(VKFFT_BACKEND==1) VkFFTLaunchParams launchParams = {}; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = cudaDeviceSynchronize(); if (res != cudaSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==2) VkFFTLaunchParams launchParams = {}; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if 
(resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = hipDeviceSynchronize(); if (res != hipSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==3) VkFFTLaunchParams launchParams = {}; launchParams.commandQueue = &commandQueue; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = clFinish(commandQueue); if (res != CL_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==4) ze_command_list_desc_t commandListDescription = {}; commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; ze_command_list_handle_t commandList = {}; res = zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; VkFFTLaunchParams launchParams = {}; launchParams.commandList = &commandList; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = zeCommandListClose(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return 
VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } res = zeCommandListDestroy(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; } #elif(VKFFT_BACKEND==5) VkFFTLaunchParams launchParams = {}; MTL::CommandBuffer* commandBuffer = app->configuration.queue->commandBuffer(); if (commandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder(); if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; launchParams.commandBuffer = commandBuffer; launchParams.commandEncoder = commandEncoder; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } commandEncoder->endEncoding(); commandBuffer->commit(); commandBuffer->waitUntilCompleted(); commandEncoder->release(); commandBuffer->release(); #endif } if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { double* phaseVectors_cast = (double*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_cast[2 * i + 1] = -phaseVectors_cast[2 * i + 1]; } } else { float* phaseVectors_cast = (float*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { phaseVectors_cast[2 * i + 1] = -phaseVectors_cast[2 * i + 1]; } } } else { if (app->configuration.doublePrecision 
|| app->configuration.doublePrecisionFloatMemory) { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; double* phaseVectors_cast = (double*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); long double angle = double_PI * rm / phaseVectorsNonZeroSize; phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)sin(angle) : 0; } for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } } else { double double_PI = 3.14159265358979323846264338327950288419716939937510; float* phaseVectors_cast = (float*)phaseVectors; for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); double angle = double_PI * rm / phaseVectorsNonZeroSize; phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)sin(angle) : 0; } for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; } } } resFFT = VkFFT_transferDataFromCPU(app, phaseVectors, &app->bufferBluestein[axis_id], bufferSize); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } #if(VKFFT_BACKEND==0) if (!app->configuration.makeInversePlanOnly) { VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = {}; res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; } VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; } VkFFTLaunchParams launchParams = {}; launchParams.commandBuffer = &commandBuffer; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; //Record commands resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = 
vkEndCommandBuffer(commandBuffer); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; } res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_RESET_FENCES; } vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = {}; res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; } 
VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; } VkFFTLaunchParams launchParams = {}; launchParams.commandBuffer = &commandBuffer; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; //Record commands resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = vkEndCommandBuffer(commandBuffer); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; } res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); if (res != 0) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_RESET_FENCES; } vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], 
kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); } #elif(VKFFT_BACKEND==1) VkFFTLaunchParams launchParams = {}; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; if (!app->configuration.makeInversePlanOnly) { launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = cudaDeviceSynchronize(); if (res != cudaSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = cudaDeviceSynchronize(); if (res != cudaSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } #elif(VKFFT_BACKEND==2) VkFFTLaunchParams launchParams = {}; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; if (!app->configuration.makeInversePlanOnly) { launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = hipDeviceSynchronize(); if (res != hipSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { 
free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = hipDeviceSynchronize(); if (res != hipSuccess) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } #elif(VKFFT_BACKEND==3) VkFFTLaunchParams launchParams = {}; launchParams.commandQueue = &commandQueue; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; if (!app->configuration.makeInversePlanOnly) { launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = clFinish(commandQueue); if (res != CL_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = clFinish(commandQueue); if (res != CL_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } #elif(VKFFT_BACKEND==4) ze_command_list_desc_t commandListDescription = {}; commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; ze_command_list_handle_t commandList = {}; res = zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; VkFFTLaunchParams launchParams = {}; launchParams.commandList = &commandList; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; if (!app->configuration.makeInversePlanOnly) { launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; 
resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = zeCommandListClose(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } res = zeCommandListReset(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; } } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = zeCommandListClose(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } } 
res = zeCommandListDestroy(commandList); if (res != ZE_RESULT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; } #elif(VKFFT_BACKEND==5) VkFFTLaunchParams launchParams = {}; launchParams.inputBuffer = &app->bufferBluestein[axis_id]; if (!app->configuration.makeInversePlanOnly) { MTL::CommandBuffer* commandBuffer = app->configuration.queue->commandBuffer(); if (commandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder(); if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; launchParams.commandBuffer = commandBuffer; launchParams.commandEncoder = commandEncoder; launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } commandEncoder->endEncoding(); commandBuffer->commit(); commandBuffer->waitUntilCompleted(); commandEncoder->release(); commandBuffer->release(); } if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { MTL::CommandBuffer* commandBuffer = app->configuration.queue->commandBuffer(); if (commandBuffer == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder(); if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; launchParams.commandBuffer = commandBuffer; launchParams.commandEncoder = commandEncoder; launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(phaseVectors); deleteVkFFT(&kernelPreparationApplication); return resFFT; } commandEncoder->endEncoding(); commandBuffer->commit(); commandBuffer->waitUntilCompleted(); 
commandEncoder->release(); commandBuffer->release(); } #endif #if(VKFFT_BACKEND==0) kernelPreparationApplication.configuration.isCompilerInitialized = 0; #elif(VKFFT_BACKEND==3) res = clReleaseCommandQueue(commandQueue); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; #endif if (kernelPreparationConfiguration.saveApplicationToString) { app->applicationBluesteinStringSize[axis_id] = kernelPreparationApplication.applicationStringSize; app->applicationBluesteinString[axis_id] = calloc(app->applicationBluesteinStringSize[axis_id], 1); if (!app->applicationBluesteinString[axis_id]) { deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(app->applicationBluesteinString[axis_id], kernelPreparationApplication.saveApplicationString, app->applicationBluesteinStringSize[axis_id]); } deleteVkFFT(&kernelPreparationApplication); free(phaseVectors); #ifdef VkFFT_use_FP128_Bluestein_RaderFFT } #endif return resFFT; } static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkFFTAxis* axis) { //generate Rader FFTKernel VkFFTResult resFFT = VKFFT_SUCCESS; if (axis->specializationConstants.useRader) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { for (uint64_t j = 0; j < app->numRaderFFTPrimes; j++) { if (app->rader_primes[j] == axis->specializationConstants.raderContainer[i].prime) { axis->specializationConstants.raderContainer[i].raderFFTkernel = app->raderFFTkernel[j]; } } if (axis->specializationConstants.raderContainer[i].raderFFTkernel) continue; uint64_t write_id = app->numRaderFFTPrimes; app->rader_primes[write_id] = axis->specializationConstants.raderContainer[i].prime; app->numRaderFFTPrimes++; if (app->configuration.loadApplicationFromString) continue; #ifdef VkFFT_use_FP128_Bluestein_RaderFFT if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { long double 
double_PI = 3.14159265358979323846264338327950288419716939937510L; double* raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2); if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; long double* raderFFTkernel_temp = (long double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(long double) * 2); if (!raderFFTkernel_temp) return VKFFT_ERROR_MALLOC_FAILED; for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } raderFFTkernel_temp[2 * j] = cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); raderFFTkernel_temp[2 * j + 1] = -sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); } fftwl_plan p; p = fftwl_plan_dft_1d((int)(axis->specializationConstants.raderContainer[i].prime - 1), (fftwl_complex*)raderFFTkernel_temp, (fftwl_complex*)raderFFTkernel_temp, -1, FFTW_ESTIMATE); fftwl_execute(p); fftwl_destroy_plan(p); for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later raderFFTkernel[2 * j] = (double)raderFFTkernel_temp[2 * j]; raderFFTkernel[2 * j + 1] = (double)raderFFTkernel_temp[2 * j + 1]; } free(raderFFTkernel_temp); continue; } #endif if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; double* 
raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2); if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } raderFFTkernel[2 * j] = (double)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); raderFFTkernel[2 * j + 1] = (double)-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); } } else { double double_PI = 3.14159265358979323846264338327950288419716939937510; float* raderFFTkernel = (float*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2); if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; app->raderFFTkernel[write_id] = (void*)raderFFTkernel; app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2; for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } raderFFTkernel[2 * j] = (float)cos(2.0 * g_pow * double_PI / 
axis->specializationConstants.raderContainer[i].prime); raderFFTkernel[2 * j + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } VkFFTApplication kernelPreparationApplication = {}; VkFFTConfiguration kernelPreparationConfiguration = {}; kernelPreparationConfiguration.FFTdim = 1; kernelPreparationConfiguration.size[0] = axis->specializationConstants.raderContainer[i].prime - 1; kernelPreparationConfiguration.size[1] = 1; kernelPreparationConfiguration.size[2] = 1; kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); kernelPreparationConfiguration.useLUT = 1; kernelPreparationConfiguration.fixMinRaderPrimeFFT = 17; kernelPreparationConfiguration.fixMinRaderPrimeMult = 17; kernelPreparationConfiguration.fixMaxRaderPrimeFFT = 17; kernelPreparationConfiguration.fixMaxRaderPrimeMult = 17; kernelPreparationConfiguration.device = app->configuration.device; #if(VKFFT_BACKEND==0) kernelPreparationConfiguration.queue = app->configuration.queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers kernelPreparationConfiguration.fence = app->configuration.fence; kernelPreparationConfiguration.commandPool = app->configuration.commandPool; kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can be initialized before VkFFT plan creation. 
if not, VkFFT will create and destroy one after initialization kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; #elif(VKFFT_BACKEND==3) kernelPreparationConfiguration.context = app->configuration.context; #elif(VKFFT_BACKEND==4) kernelPreparationConfiguration.context = app->configuration.context; kernelPreparationConfiguration.commandQueue = app->configuration.commandQueue; kernelPreparationConfiguration.commandQueueID = app->configuration.commandQueueID; #elif(VKFFT_BACKEND==5) kernelPreparationConfiguration.device = app->configuration.device; kernelPreparationConfiguration.queue = app->configuration.queue; #endif uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2]; if (kernelPreparationConfiguration.doublePrecision) bufferSize *= sizeof(double) / sizeof(float); kernelPreparationConfiguration.bufferSize = &bufferSize; resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) VkDeviceMemory bufferRaderFFTDeviceMemory; VkBuffer bufferRaderFFT; #elif(VKFFT_BACKEND==1) void* bufferRaderFFT; #elif(VKFFT_BACKEND==2) void* bufferRaderFFT; #elif(VKFFT_BACKEND==3) cl_mem bufferRaderFFT; #elif(VKFFT_BACKEND==4) void* bufferRaderFFT; #elif(VKFFT_BACKEND==5) MTL::Buffer* bufferRaderFFT; #endif #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; resFFT = allocateFFTBuffer(app, &bufferRaderFFT, &bufferRaderFFTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); if (resFFT != VKFFT_SUCCESS) return resFFT; #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; res = cudaMalloc(&bufferRaderFFT, bufferSize); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; #elif(VKFFT_BACKEND==2) hipError_t res = 
hipSuccess; res = hipMalloc(&bufferRaderFFT, bufferSize); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; bufferRaderFFT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &bufferRaderFFT); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; #elif(VKFFT_BACKEND==5) bufferRaderFFT = app->configuration.device->newBuffer(bufferSize, MTL::ResourceStorageModePrivate); #endif resFFT = VkFFT_transferDataFromCPU(app, axis->specializationConstants.raderContainer[i].raderFFTkernel, &bufferRaderFFT, bufferSize); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } #if(VKFFT_BACKEND==0) { VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; commandBufferAllocateInfo.commandBufferCount = 1; VkCommandBuffer commandBuffer = {}; res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); if (res != 0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return 
VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; } VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); if (res != 0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; } VkFFTLaunchParams launchParams = {}; launchParams.commandBuffer = &commandBuffer; launchParams.buffer = &bufferRaderFFT; //Record commands resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = vkEndCommandBuffer(commandBuffer); if (res != 0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; submitInfo.commandBufferCount = 1; submitInfo.pCommandBuffers = &commandBuffer; res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); if (res != 0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); if (res != 0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; } res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); if (res != 
0) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_RESET_FENCES; } vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); } #elif(VKFFT_BACKEND==1) VkFFTLaunchParams launchParams = {}; launchParams.buffer = &bufferRaderFFT; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = cudaDeviceSynchronize(); if (res != cudaSuccess) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==2) VkFFTLaunchParams launchParams = {}; launchParams.buffer = &bufferRaderFFT; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = hipDeviceSynchronize(); if (res != hipSuccess) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==3) VkFFTLaunchParams launchParams = {}; launchParams.commandQueue = &commandQueue; launchParams.buffer = &bufferRaderFFT; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = clFinish(commandQueue); if (res != CL_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return 
VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } #elif(VKFFT_BACKEND==4) ze_command_list_desc_t commandListDescription = {}; commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; ze_command_list_handle_t commandList = {}; res = zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; VkFFTLaunchParams launchParams = {}; launchParams.commandList = &commandList; launchParams.buffer = &bufferRaderFFT; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } res = zeCommandListClose(commandList); if (res != ZE_RESULT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; } res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); if (res != ZE_RESULT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; } res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); if (res != ZE_RESULT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } res = zeCommandListDestroy(commandList); if (res != ZE_RESULT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; } #elif(VKFFT_BACKEND==5) VkFFTLaunchParams launchParams = {}; MTL::CommandBuffer* commandBuffer = app->configuration.queue->commandBuffer(); if (commandBuffer == 0) return 
VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; MTL::ComputeCommandEncoder* commandEncoder = commandBuffer->computeCommandEncoder(); if (commandEncoder == 0) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; launchParams.commandBuffer = commandBuffer; launchParams.commandEncoder = commandEncoder; launchParams.buffer = &bufferRaderFFT; resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } commandEncoder->endEncoding(); commandBuffer->commit(); commandBuffer->waitUntilCompleted(); commandEncoder->release(); commandBuffer->release(); #endif resFFT = VkFFT_transferDataToCPU(&kernelPreparationApplication, axis->specializationConstants.raderContainer[i].raderFFTkernel, &bufferRaderFFT, bufferSize); if (resFFT != VKFFT_SUCCESS) { free(axis->specializationConstants.raderContainer[i].raderFFTkernel); deleteVkFFT(&kernelPreparationApplication); return resFFT; } #if(VKFFT_BACKEND==0) kernelPreparationApplication.configuration.isCompilerInitialized = 0; #elif(VKFFT_BACKEND==3) res = clReleaseCommandQueue(commandQueue); if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; #endif #if(VKFFT_BACKEND==0) vkDestroyBuffer(app->configuration.device[0], bufferRaderFFT, 0); vkFreeMemory(app->configuration.device[0], bufferRaderFFTDeviceMemory, 0); #elif(VKFFT_BACKEND==1) cudaFree(bufferRaderFFT); #elif(VKFFT_BACKEND==2) hipFree(bufferRaderFFT); #elif(VKFFT_BACKEND==3) clReleaseMemObject(bufferRaderFFT); #elif(VKFFT_BACKEND==4) zeMemFree(app->configuration.context[0], bufferRaderFFT); #elif(VKFFT_BACKEND==5) bufferRaderFFT->release(); #endif deleteVkFFT(&kernelPreparationApplication); } } if (app->configuration.loadApplicationFromString) { uint64_t offset = 0; for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { uint64_t current_size = 0; if (app->configuration.doublePrecision || 
app->configuration.doublePrecisionFloatMemory) { current_size = (app->rader_primes[i] - 1) * sizeof(double) * 2; } else { current_size = (app->rader_primes[i] - 1) * sizeof(float) * 2; } if (!app->raderFFTkernel[i]) { app->raderFFTkernel[i] = (void*)malloc(current_size); if (!app->raderFFTkernel[i]) return VKFFT_ERROR_MALLOC_FAILED; memcpy(app->raderFFTkernel[i], (char*)app->configuration.loadApplicationString + app->applicationStringOffsetRader + offset, current_size); } for (uint64_t j = 0; j < axis->specializationConstants.numRaderPrimes; j++) { if ((app->rader_primes[i] == axis->specializationConstants.raderContainer[j].prime) && (axis->specializationConstants.raderContainer[j].type == 0)) axis->specializationConstants.raderContainer[j].raderFFTkernel = app->raderFFTkernel[i]; } offset += current_size; } } } return resFFT; } static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFTAxis* axis, uint64_t planStage, VkFFTLaunchParams* launchParams) { uint64_t performBufferSetUpdate = planStage; uint64_t performOffsetUpdate = planStage; if (!planStage) { if (launchParams != 0) { if ((launchParams->buffer != 0) && (app->configuration.buffer != launchParams->buffer)) { app->configuration.buffer = launchParams->buffer; performBufferSetUpdate = 1; } if ((launchParams->inputBuffer != 0) && (app->configuration.inputBuffer != launchParams->inputBuffer)) { app->configuration.inputBuffer = launchParams->inputBuffer; performBufferSetUpdate = 1; } if ((launchParams->outputBuffer != 0) && (app->configuration.outputBuffer != launchParams->outputBuffer)) { app->configuration.outputBuffer = launchParams->outputBuffer; performBufferSetUpdate = 1; } if ((launchParams->tempBuffer != 0) && (app->configuration.tempBuffer != launchParams->tempBuffer)) { app->configuration.tempBuffer = launchParams->tempBuffer; performBufferSetUpdate = 1; } if ((launchParams->kernel != 0) && (app->configuration.kernel != launchParams->kernel)) { app->configuration.kernel = 
launchParams->kernel; performBufferSetUpdate = 1; } if (app->configuration.inputBuffer == 0) app->configuration.inputBuffer = app->configuration.buffer; if (app->configuration.outputBuffer == 0) app->configuration.outputBuffer = app->configuration.buffer; if (app->configuration.bufferOffset != launchParams->bufferOffset) { app->configuration.bufferOffset = launchParams->bufferOffset; performOffsetUpdate = 1; } if (app->configuration.inputBufferOffset != launchParams->inputBufferOffset) { app->configuration.inputBufferOffset = launchParams->inputBufferOffset; performOffsetUpdate = 1; } if (app->configuration.outputBufferOffset != launchParams->outputBufferOffset) { app->configuration.outputBufferOffset = launchParams->outputBufferOffset; performOffsetUpdate = 1; } if (app->configuration.tempBufferOffset != launchParams->tempBufferOffset) { app->configuration.tempBufferOffset = launchParams->tempBufferOffset; performOffsetUpdate = 1; } if (app->configuration.kernelOffset != launchParams->kernelOffset) { app->configuration.kernelOffset = launchParams->kernelOffset; performOffsetUpdate = 1; } } } if (planStage) { if (app->configuration.buffer == 0) { performBufferSetUpdate = 0; } if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) { performBufferSetUpdate = 0; } if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) { performBufferSetUpdate = 0; } if ((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) { performBufferSetUpdate = 0; } if ((app->configuration.performConvolution) && (app->configuration.kernel == 0)) { performBufferSetUpdate = 0; } } else { if (app->configuration.buffer == 0) { return VKFFT_ERROR_EMPTY_buffer; } if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) { return VKFFT_ERROR_EMPTY_inputBuffer; } if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) { return VKFFT_ERROR_EMPTY_outputBuffer; } if 
((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) { return VKFFT_ERROR_EMPTY_tempBuffer; } if ((app->configuration.performConvolution) && (app->configuration.kernel == 0)) { return VKFFT_ERROR_EMPTY_kernel; } } if (performBufferSetUpdate) { if (planStage) axis->specializationConstants.performBufferSetUpdate = 1; else { if (!app->configuration.makeInversePlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->axes[i][j].specializationConstants.performBufferSetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; } } if (app->localFFTPlan->multiUploadR2C) { app->localFFTPlan->R2Cdecomposition.specializationConstants.performBufferSetUpdate = 1; } } if (!app->configuration.makeForwardPlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->axes[i][j].specializationConstants.performBufferSetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; } } if (app->localFFTPlan_inverse->multiUploadR2C) { app->localFFTPlan_inverse->R2Cdecomposition.specializationConstants.performBufferSetUpdate = 1; } } } } if (performOffsetUpdate) { if (planStage) axis->specializationConstants.performOffsetUpdate = 1; else { if (!app->configuration.makeInversePlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) 
app->localFFTPlan->axes[i][j].specializationConstants.performOffsetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; } } if (app->localFFTPlan->multiUploadR2C) { app->localFFTPlan->R2Cdecomposition.specializationConstants.performOffsetUpdate = 1; } } if (!app->configuration.makeForwardPlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->axes[i][j].specializationConstants.performOffsetUpdate = 1; if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; } } if (app->localFFTPlan_inverse->multiUploadR2C) { app->localFFTPlan_inverse->R2Cdecomposition.specializationConstants.performOffsetUpdate = 1; } } } } return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { #if(VKFFT_BACKEND==0) const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; #endif uint64_t storageComplexSize; if (app->configuration.doublePrecision) storageComplexSize = (2 * sizeof(double)); else if (app->configuration.halfPrecision) storageComplexSize = (2 * 2); else storageComplexSize = (2 * sizeof(float)); for (uint64_t i = 0; i < axis->numBindings; ++i) { for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { #if(VKFFT_BACKEND==0) VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; #endif 
if (i == 0) { if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.inputBufferSize) { for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.inputBufferNum; } } } axis->inputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset; } } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / 
(double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset; } } else { uint64_t bufferId = 0; uint64_t offset = j; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if ((((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload) && (axis->specializationConstants.reorderFourStep == 1) && (inverse == 1)))) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->inputBuffer = 
app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.bufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.tempBufferNum; } } } axis->inputBuffer = app->configuration.tempBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.tempBufferOffset; } } } else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->inputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.bufferOffset; } } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { 
descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); } #endif } } //descriptorBufferInfo.offset = 0; } if (i == 1) { if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->outputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset 
* (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset; } } else { uint64_t bufferId = 0; uint64_t offset = j; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if ((inverse) && (axis_id == app->firstAxis) && ( ((axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer) && (!app->useBluesteinFFT[axis_id])) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (axis->specializationConstants.actualInverse) && (app->configuration.inverseReturnToInputBuffer) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)))) ) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.inputBufferSize) { for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.inputBufferNum; } } } axis->outputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset; } } else { if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) 
&& (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.tempBufferNum; } } } axis->outputBuffer = app->configuration.tempBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.tempBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.bufferOffset; } } } } else { if ((inverse) && (axis_id == app->firstAxis) && (axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer)) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.inputBufferSize) { for 
(uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.inputBufferNum; } } } axis->outputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.bufferOffset; } } } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); } #endif } //descriptorBufferInfo.offset = 0; } if ((i == axis->specializationConstants.convolutionBindingID) && (app->configuration.performConvolution)) 
{ if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.kernelSize) { for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.kernelNum; } } } #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.kernel[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.kernelOffset = app->configuration.kernelOffset; } } if ((i == axis->specializationConstants.LUTBindingID) && (app->configuration.useLUT == 1)) { #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { descriptorBufferInfo.buffer = axis->bufferLUT; descriptorBufferInfo.offset = 0; descriptorBufferInfo.range = axis->bufferLUTSize; } #endif } if ((i == axis->specializationConstants.RaderUintLUTBindingID) && (axis->specializationConstants.raderUintLUT)) { #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { descriptorBufferInfo.buffer = axis->bufferRaderUintLUT; descriptorBufferInfo.offset = 0; descriptorBufferInfo.range = axis->bufferRaderUintLUTSize; } #endif } if ((i == axis->specializationConstants.BluesteinConvolutionBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) { #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { if (axis->specializationConstants.inverseBluestein) descriptorBufferInfo.buffer = 
app->bufferBluesteinIFFT[axis_id]; else descriptorBufferInfo.buffer = app->bufferBluesteinFFT[axis_id]; descriptorBufferInfo.offset = 0; descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id]; } #endif } if ((i == axis->specializationConstants.BluesteinMultiplicationBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) { #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { descriptorBufferInfo.buffer = app->bufferBluestein[axis_id]; descriptorBufferInfo.offset = 0; descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id]; } #endif } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; writeDescriptorSet.dstSet = axis->descriptorSet; writeDescriptorSet.dstBinding = (uint32_t)i; writeDescriptorSet.dstArrayElement = (uint32_t)j; writeDescriptorSet.descriptorType = descriptorType; writeDescriptorSet.descriptorCount = 1; writeDescriptorSet.pBufferInfo = &descriptorBufferInfo; vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0); } #endif } } } if (axis->specializationConstants.performBufferSetUpdate) { axis->specializationConstants.performBufferSetUpdate = 0; } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.performOffsetUpdate = 0; } return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { #if(VKFFT_BACKEND==0) const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; #endif uint64_t storageComplexSize; if (app->configuration.doublePrecision) storageComplexSize = (2 * sizeof(double)); else if 
(app->configuration.halfPrecision) storageComplexSize = (2 * 2); else storageComplexSize = (2 * sizeof(float)); for (uint64_t i = 0; i < axis->numBindings; ++i) { for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { #if(VKFFT_BACKEND==0) VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; #endif if (i == 0) { if (inverse) { if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.inputBufferSize) { for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.inputBufferNum; } } } axis->inputBuffer = app->configuration.inputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset; } } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { if 
(axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->inputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize 
* storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.bufferOffset; } } } } else { if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->inputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if 
(axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->inputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.inputOffset = app->configuration.bufferOffset; } } } } if (i == 1) { if (inverse) { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->outputBuffer = 
app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset; } } else { uint64_t bufferId = 0; uint64_t offset = j; if (axis->specializationConstants.reorderFourStep == 1) { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.tempBufferSize) { for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.tempBufferNum; } } } axis->outputBuffer = app->configuration.tempBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.tempBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset 
-= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.bufferOffset; } } } } else { if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.outputBufferSize) { for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / 
(double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.outputBufferNum; } } } axis->outputBuffer = app->configuration.outputBuffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset; } } else { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.bufferSize) { for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.bufferNum; } } } axis->outputBuffer = app->configuration.buffer; #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.outputOffset = app->configuration.bufferOffset; } } } } if ((i == 2) && (app->configuration.performConvolution)) { if (axis->specializationConstants.performBufferSetUpdate) { uint64_t bufferId = 0; uint64_t offset = j; if (app->configuration.kernelSize) { for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) 
{ if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { bufferId++; offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); } else { l = app->configuration.kernelNum; } } } #if(VKFFT_BACKEND==0) descriptorBufferInfo.buffer = app->configuration.kernel[bufferId]; descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize); descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize); #endif } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.kernelOffset = app->configuration.kernelOffset; } } if ((i == axis->numBindings - 1) && (app->configuration.useLUT == 1)) { #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { descriptorBufferInfo.buffer = axis->bufferLUT; descriptorBufferInfo.offset = 0; descriptorBufferInfo.range = axis->bufferLUTSize; } #endif } #if(VKFFT_BACKEND==0) if (axis->specializationConstants.performBufferSetUpdate) { VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; writeDescriptorSet.dstSet = axis->descriptorSet; writeDescriptorSet.dstBinding = (uint32_t)i; writeDescriptorSet.dstArrayElement = (uint32_t)j; writeDescriptorSet.descriptorType = descriptorType; writeDescriptorSet.descriptorCount = 1; writeDescriptorSet.pBufferInfo = &descriptorBufferInfo; vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0); } #endif } } } if (axis->specializationConstants.performBufferSetUpdate) { axis->specializationConstants.performBufferSetUpdate = 0; } if (axis->specializationConstants.performOffsetUpdate) { axis->specializationConstants.performOffsetUpdate = 0; } return VKFFT_SUCCESS; } static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* 
app, VkFFTPlan* FFTPlan, uint64_t inverse) { //get radix stages VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; #elif(VKFFT_BACKEND==5) #endif VkFFTAxis* axis = &FFTPlan->R2Cdecomposition; axis->specializationConstants.warpSize = app->configuration.warpSize; axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks; axis->specializationConstants.useUint64 = app->configuration.useUint64; #if(VKFFT_BACKEND==2) axis->specializationConstants.useStrict32BitAddress = app->configuration.useStrict32BitAddress; #endif axis->specializationConstants.disableSetLocale = app->configuration.disableSetLocale; axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[0]; axis->specializationConstants.reorderFourStep = ((FFTPlan->numAxisUploads[0] > 1) && (!app->useBluesteinFFT[0])) ? 
app->configuration.reorderFourStep : 0; uint64_t complexSize; if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { axis->specializationConstants.precision = 1; complexSize = (2 * sizeof(double)); } else { if (app->configuration.halfPrecision) { axis->specializationConstants.precision = 0; complexSize = (2 * sizeof(float)); } else { axis->specializationConstants.precision = 0; complexSize = (2 * sizeof(float)); } } axis->specializationConstants.complexSize = complexSize; axis->specializationConstants.supportAxis = 0; axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel; axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution; axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization; axis->specializationConstants.fft_dim_full = app->configuration.size[0]; axis->specializationConstants.dispatchZactualFFTSize = 1; //allocate LUT if (app->configuration.useLUT == 1) { if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { long double double_PI = 3.14159265358979323846264338327950288419716939937510L; axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(double); double* tempLUT = (double*)malloc(axis->bufferLUTSize); if (!tempLUT) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { long double angle = double_PI * i / (app->configuration.size[0] / 2); tempLUT[2 * i] = (double)cos(angle); tempLUT[2 * i + 1] = (double)sin(angle); } axis->referenceLUT = 0; if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory; #endif axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize; 
axis->referenceLUT = 1; } else { #if(VKFFT_BACKEND==0) resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==1) res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != cudaSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==2) res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != hipSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==3) axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return 
VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==5) axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #endif free(tempLUT); tempLUT = 0; } } else { double double_PI = 3.14159265358979323846264338327950288419716939937510; axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(float); float* tempLUT = (float*)malloc(axis->bufferLUTSize); if (!tempLUT) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { double angle = double_PI * i / (app->configuration.size[0] / 2); tempLUT[2 * i] = (float)cos(angle); tempLUT[2 * i + 1] = (float)sin(angle); } axis->referenceLUT = 0; if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory; #endif axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize; axis->referenceLUT = 1; } else { #if(VKFFT_BACKEND==0) resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } 
#elif(VKFFT_BACKEND==1) res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != cudaSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==2) res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != hipSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==3) axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==5) axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #endif free(tempLUT); tempLUT = 0; } } } 
//configure strides uint64_t* axisStride = axis->specializationConstants.inputStride; uint64_t* usedStride = 0; if (app->useBluesteinFFT[0] && (FFTPlan->numAxisUploads[0] > 1)) { if (inverse) usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.inputStride; else usedStride = FFTPlan->inverseBluesteinAxes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.outputStride; } else { if (inverse) usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.inputStride; else usedStride = FFTPlan->axes[0][0].specializationConstants.outputStride; } axisStride[0] = usedStride[0]; axisStride[1] = usedStride[1]; axisStride[2] = usedStride[2]; axisStride[3] = usedStride[3]; axisStride[4] = usedStride[4]; axisStride = axis->specializationConstants.outputStride; usedStride = axis->specializationConstants.inputStride; axisStride[0] = usedStride[0]; axisStride[1] = usedStride[1]; axisStride[2] = usedStride[2]; axisStride[3] = usedStride[3]; axisStride[4] = usedStride[4]; axis->specializationConstants.inverse = inverse; uint64_t storageComplexSize; if (app->configuration.doublePrecision) storageComplexSize = (2 * sizeof(double)); else if (app->configuration.halfPrecision) storageComplexSize = (2 * 2); else storageComplexSize = (2 * sizeof(float)); uint64_t initPageSize = -1; uint64_t locBufferNum = 1; uint64_t locBufferSize = 0; /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { initPageSize += app->configuration.bufferSize[i]; } if (app->configuration.performConvolution) { uint64_t initPageSizeKernel = 0; for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { initPageSizeKernel += app->configuration.kernelSize[i]; } if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel; } if ((!((!app->configuration.reorderFourStep))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { 
initPageSize = app->configuration.localPageSize * 1024; }*/ uint64_t axis_id = 0; uint64_t axis_upload_id = 0; { if (inverse) { if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) ) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.inputBufferNum; if (app->configuration.inputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { totalSize += app->configuration.inputBufferSize[i]; if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; } } } else { if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } } } initPageSize = -1; locBufferNum = 1; locBufferSize = -1; { if (inverse) { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } } else { if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } } } if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1; if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; if (app->configuration.performConvolution) { //need fixing (not used now) uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; if (app->configuration.kernelSize) { for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { totalSize += app->configuration.kernelSize[i]; if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; } } axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.kernelBlockNum = 
(uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize)); //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; } else { axis->specializationConstants.kernelBlockSize = 0; axis->specializationConstants.kernelBlockNum = 0; } axis->numBindings = 2; axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum; axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum; axis->specializationConstants.numBuffersBound[2] = 0; axis->specializationConstants.numBuffersBound[3] = 0; #if(VKFFT_BACKEND==0) VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]); #endif if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) { axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; #endif axis->numBindings++; } if (app->configuration.useLUT == 1) { axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount++; #endif axis->numBindings++; } #if(VKFFT_BACKEND==0) VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; descriptorPoolCreateInfo.poolSizeCount = 1; descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize; descriptorPoolCreateInfo.maxSets = 1; res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 
0, &axis->descriptorPool); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL; } const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings; descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding)); if (!descriptorSetLayoutBindings) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < axis->numBindings; ++i) { descriptorSetLayoutBindings[i].binding = (uint32_t)i; descriptorSetLayoutBindings[i].descriptorType = descriptorType; descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; } VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings; descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings; res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT; } free(descriptorSetLayoutBindings); descriptorSetLayoutBindings = 0; VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool; descriptorSetAllocateInfo.descriptorSetCount = 1; descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout; res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS; } #endif if (app->configuration.specifyOffsetsAtLaunch) { 
axis->specializationConstants.performPostCompilationInputOffset = 1; axis->specializationConstants.performPostCompilationOutputOffset = 1; if (app->configuration.performConvolution) axis->specializationConstants.performPostCompilationKernelOffset = 1; } resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, FFTPlan, axis, axis_id, axis_upload_id, inverse); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } { axis->axisBlock[0] = 128; if (axis->axisBlock[0] > app->configuration.maxThreadsNum) axis->axisBlock[0] = app->configuration.maxThreadsNum; axis->axisBlock[1] = 1; axis->axisBlock[2] = 1; uint64_t tempSize[3] = { (uint64_t)ceil((app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])), 1, 1 }; tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches * app->configuration.coordinateFeatures; if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { uint64_t temp_tempSize = tempSize[0]; tempSize[0] = tempSize[1]; tempSize[1] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 1; } else { if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { uint64_t temp_tempSize = tempSize[0]; tempSize[0] = tempSize[2]; tempSize[2] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 2; } } if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1; else 
axis->specializationConstants.performWorkGroupShift[0] = 0; if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1; else axis->specializationConstants.performWorkGroupShift[1] = 0; if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1; else axis->specializationConstants.performWorkGroupShift[2] = 0; axis->specializationConstants.localSize[0] = axis->axisBlock[0]; axis->specializationConstants.localSize[1] = axis->axisBlock[1]; axis->specializationConstants.localSize[2] = axis->axisBlock[2]; axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution; axis->specializationConstants.size[0] = app->configuration.size[0]; axis->specializationConstants.size[1] = app->configuration.size[1]; axis->specializationConstants.size[2] = app->configuration.size[2]; axis->specializationConstants.numBatches = app->configuration.numberBatches; if ((app->configuration.FFTdim == 1) && (app->configuration.size[1] == 1) && ((app->configuration.numberBatches == 1) && (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) { axis->specializationConstants.numBatches = app->actualNumBatches; } axis->specializationConstants.numKernels = app->configuration.numberKernels; axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize; axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2; axis->specializationConstants.normalize = app->configuration.normalize; axis->specializationConstants.axis_id = 0; axis->specializationConstants.axis_upload_id = 0; for (uint64_t i = 0; i < 3; i++) { axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding; 
axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i]; axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i]; } /*if ((inverse)) { if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) { axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id]; axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id]; } else axis->specializationConstants.zeropad[0] = 0; if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) { axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id]; axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id]; } else axis->specializationConstants.zeropad[1] = 0; } else { if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) { axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id]; axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id]; } else axis->specializationConstants.zeropad[0] = 0; if (((app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) || (((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)))) { 
axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id]; axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id]; } else axis->specializationConstants.zeropad[1] = 0; }*/ if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) { axis->specializationConstants.convolutionStep = 1; } else axis->specializationConstants.convolutionStep = 0; char floatTypeInputMemory[10]; char floatTypeOutputMemory[10]; char floatTypeKernelMemory[10]; char floatType[10]; axis->specializationConstants.unroll = 1; axis->specializationConstants.LUT = (app->configuration.useLUT == 1) ? 1 : 0; if (app->configuration.doublePrecision) { sprintf(floatType, "double"); sprintf(floatTypeInputMemory, "double"); sprintf(floatTypeOutputMemory, "double"); sprintf(floatTypeKernelMemory, "double"); //axis->specializationConstants.unroll = 1; } else { //axis->specializationConstants.unroll = 0; if (app->configuration.halfPrecision) { sprintf(floatType, "float"); if (app->configuration.halfPrecisionMemoryOnly) { //only out of place mode, input/output buffer must be different sprintf(floatTypeInputMemory, "float"); sprintf(floatTypeOutputMemory, "float"); sprintf(floatTypeKernelMemory, "float"); } else { sprintf(floatTypeInputMemory, "half"); sprintf(floatTypeOutputMemory, "half"); sprintf(floatTypeKernelMemory, "half"); } } else { if (app->configuration.doublePrecisionFloatMemory) { sprintf(floatType, "double"); sprintf(floatTypeInputMemory, "float"); sprintf(floatTypeOutputMemory, "float"); sprintf(floatTypeKernelMemory, "float"); } else { sprintf(floatType, "float"); sprintf(floatTypeInputMemory, "float"); sprintf(floatTypeOutputMemory, "float"); sprintf(floatTypeKernelMemory, "float"); } } } char uintType[20] = ""; if (!app->configuration.useUint64) 
{ #if(VKFFT_BACKEND==0) sprintf(uintType, "uint"); #elif(VKFFT_BACKEND==1) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==2) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==3) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==4) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==5) sprintf(uintType, "uint"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(uintType, "uint64_t"); #elif(VKFFT_BACKEND==1) sprintf(uintType, "unsigned long long"); #elif(VKFFT_BACKEND==2) sprintf(uintType, "unsigned long long"); #elif(VKFFT_BACKEND==3) sprintf(uintType, "unsigned long"); #elif(VKFFT_BACKEND==4) sprintf(uintType, "unsigned long"); #elif(VKFFT_BACKEND==5) sprintf(uintType, "ulong"); #endif } { axis->pushConstants.structSize = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { axis->pushConstants.performWorkGroupShift[0] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performWorkGroupShift[1]) { axis->pushConstants.performWorkGroupShift[1] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performWorkGroupShift[2]) { axis->pushConstants.performWorkGroupShift[2] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationInputOffset) { axis->pushConstants.performPostCompilationInputOffset = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationOutputOffset) { axis->pushConstants.performPostCompilationOutputOffset = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationKernelOffset) { axis->pushConstants.performPostCompilationKernelOffset = 1; axis->pushConstants.structSize += 1; } if (app->configuration.useUint64) axis->pushConstants.structSize *= sizeof(uint64_t); else axis->pushConstants.structSize *= sizeof(uint32_t); axis->specializationConstants.pushConstantsStructSize = axis->pushConstants.structSize; } //uint64_t LUT = app->configuration.useLUT; 
uint64_t type = 0; axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength; axis->specializationConstants.maxTempLength = app->configuration.maxTempLength; axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength); char* code0 = axis->specializationConstants.code0; if (!code0) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } resFFT = shaderGenVkFFT_R2C_decomposition(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type); freeShaderGenVkFFT(&axis->specializationConstants); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } #if(VKFFT_BACKEND==0) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { glslang_resource_t default_resource = {}; default_resource.max_lights = 32; default_resource.max_clip_planes = 6; default_resource.max_texture_units = 32; default_resource.max_texture_coords = 32; default_resource.max_vertex_attribs = 64; default_resource.max_vertex_uniform_components = 4096; default_resource.max_varying_floats = 64; default_resource.max_vertex_texture_image_units = 32; default_resource.max_combined_texture_image_units = 80; default_resource.max_texture_image_units = 32; default_resource.max_fragment_uniform_components = 4096; default_resource.max_draw_buffers = 32; default_resource.max_vertex_uniform_vectors = 128; default_resource.max_varying_vectors = 8; default_resource.max_fragment_uniform_vectors = 16; 
default_resource.max_vertex_output_vectors = 16; default_resource.max_fragment_input_vectors = 15; default_resource.min_program_texel_offset = -8; default_resource.max_program_texel_offset = 7; default_resource.max_clip_distances = 8; default_resource.max_compute_work_group_count_x = (int)app->configuration.maxComputeWorkGroupCount[0]; default_resource.max_compute_work_group_count_y = (int)app->configuration.maxComputeWorkGroupCount[1]; default_resource.max_compute_work_group_count_z = (int)app->configuration.maxComputeWorkGroupCount[2]; default_resource.max_compute_work_group_size_x = (int)app->configuration.maxComputeWorkGroupSize[0]; default_resource.max_compute_work_group_size_y = (int)app->configuration.maxComputeWorkGroupSize[1]; default_resource.max_compute_work_group_size_z = (int)app->configuration.maxComputeWorkGroupSize[2]; default_resource.max_compute_uniform_components = 1024; default_resource.max_compute_texture_image_units = 16; default_resource.max_compute_image_uniforms = 8; default_resource.max_compute_atomic_counters = 8; default_resource.max_compute_atomic_counter_buffers = 1; default_resource.max_varying_components = 60; default_resource.max_vertex_output_components = 64; default_resource.max_geometry_input_components = 64; default_resource.max_geometry_output_components = 128; default_resource.max_fragment_input_components = 128; default_resource.max_image_units = 8; default_resource.max_combined_image_units_and_fragment_outputs = 8; default_resource.max_combined_shader_output_resources = 8; default_resource.max_image_samples = 0; default_resource.max_vertex_image_uniforms = 0; default_resource.max_tess_control_image_uniforms = 0; default_resource.max_tess_evaluation_image_uniforms = 0; default_resource.max_geometry_image_uniforms = 0; default_resource.max_fragment_image_uniforms = 8; default_resource.max_combined_image_uniforms = 8; default_resource.max_geometry_texture_image_units = 16; default_resource.max_geometry_output_vertices = 256; 
default_resource.max_geometry_total_output_components = 1024; default_resource.max_geometry_uniform_components = 1024; default_resource.max_geometry_varying_components = 64; default_resource.max_tess_control_input_components = 128; default_resource.max_tess_control_output_components = 128; default_resource.max_tess_control_texture_image_units = 16; default_resource.max_tess_control_uniform_components = 1024; default_resource.max_tess_control_total_output_components = 4096; default_resource.max_tess_evaluation_input_components = 128; default_resource.max_tess_evaluation_output_components = 128; default_resource.max_tess_evaluation_texture_image_units = 16; default_resource.max_tess_evaluation_uniform_components = 1024; default_resource.max_tess_patch_components = 120; default_resource.max_patch_vertices = 32; default_resource.max_tess_gen_level = 64; default_resource.max_viewports = 16; default_resource.max_vertex_atomic_counters = 0; default_resource.max_tess_control_atomic_counters = 0; default_resource.max_tess_evaluation_atomic_counters = 0; default_resource.max_geometry_atomic_counters = 0; default_resource.max_fragment_atomic_counters = 8; default_resource.max_combined_atomic_counters = 8; default_resource.max_atomic_counter_bindings = 1; default_resource.max_vertex_atomic_counter_buffers = 0; default_resource.max_tess_control_atomic_counter_buffers = 0; default_resource.max_tess_evaluation_atomic_counter_buffers = 0; default_resource.max_geometry_atomic_counter_buffers = 0; default_resource.max_fragment_atomic_counter_buffers = 1; default_resource.max_combined_atomic_counter_buffers = 1; default_resource.max_atomic_counter_buffer_size = 16384; default_resource.max_transform_feedback_buffers = 4; default_resource.max_transform_feedback_interleaved_components = 64; default_resource.max_cull_distances = 8; default_resource.max_combined_clip_and_cull_distances = 8; default_resource.max_samples = 4; default_resource.max_mesh_output_vertices_nv = 256; 
default_resource.max_mesh_output_primitives_nv = 512; default_resource.max_mesh_work_group_size_x_nv = 32; default_resource.max_mesh_work_group_size_y_nv = 1; default_resource.max_mesh_work_group_size_z_nv = 1; default_resource.max_task_work_group_size_x_nv = 32; default_resource.max_task_work_group_size_y_nv = 1; default_resource.max_task_work_group_size_z_nv = 1; default_resource.max_mesh_view_count_nv = 4; default_resource.limits.non_inductive_for_loops = 1; default_resource.limits.while_loops = 1; default_resource.limits.do_while_loops = 1; default_resource.limits.general_uniform_indexing = 1; default_resource.limits.general_attribute_matrix_vector_indexing = 1; default_resource.limits.general_varying_indexing = 1; default_resource.limits.general_sampler_indexing = 1; default_resource.limits.general_variable_indexing = 1; default_resource.limits.general_constant_matrix_vector_indexing = 1; glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0; glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? 
GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0; glslang_input_t input = { GLSLANG_SOURCE_GLSL, GLSLANG_STAGE_COMPUTE, GLSLANG_CLIENT_VULKAN, client_version, GLSLANG_TARGET_SPV, target_language_version, code0, 450, GLSLANG_NO_PROFILE, 1, 0, GLSLANG_MSG_DEFAULT_BIT, (const glslang_resource_t*)&default_resource, }; //printf("%s\n", code0); glslang_shader_t* shader = glslang_shader_create((const glslang_input_t*)&input); const char* err; if (!glslang_shader_preprocess(shader, &input)) { err = glslang_shader_get_info_log(shader); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_PREPROCESS; } if (!glslang_shader_parse(shader, &input)) { err = glslang_shader_get_info_log(shader); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_PARSE; } glslang_program_t* program = glslang_program_create(); glslang_program_add_shader(program, shader); if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT)) { err = glslang_program_get_info_log(program); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); glslang_program_delete(program); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_LINK; } glslang_program_SPIRV_generate(program, input.stage); if (glslang_program_SPIRV_get_messages(program)) { printf("%s", glslang_program_SPIRV_get_messages(program)); glslang_shader_delete(shader); glslang_program_delete(program); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SPIRV_GENERATE; } glslang_shader_delete(shader); uint32_t* tempCode = glslang_program_SPIRV_get_ptr(program); codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t); axis->binarySize = codeSize; code 
= (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; glslang_program_delete(program); deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } axis->binary = code; memcpy(code, tempCode, codeSize); glslang_program_delete(program); } VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; createInfo.pCode = code; createInfo.codeSize = codeSize; res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module); if (res != VK_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; pipelineLayoutCreateInfo.setLayoutCount = 1; pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout; VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT }; pushConstantRange.offset = 0; pushConstantRange.size = (uint32_t)axis->pushConstants.structSize; // Push constant ranges are part of the pipeline layout if (axis->pushConstants.structSize) { pipelineLayoutCreateInfo.pushConstantRangeCount = 1; pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; } res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT; } pipelineShaderStageCreateInfo.pName = "main"; pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo; computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; computePipelineCreateInfo.layout = axis->pipelineLayout; if 
(app->configuration.pipelineCache) res = vkCreateComputePipelines(app->configuration.device[0], app->configuration.pipelineCache[0], 1, &computePipelineCreateInfo, 0, &axis->pipeline); else res = vkCreateComputePipelines(app->configuration.device[0], 0, 1, &computePipelineCreateInfo, 0, &axis->pipeline); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE; } vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0); if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==1) char* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { nvrtcProgram prog; nvrtcResult result = nvrtcCreateProgram(&prog, // prog code0, // buffer "VkFFT.cu", // name 0, // numHeaders 0, // headers 0); // includeNames //free(includeNames); //free(headers); if (result != NVRTC_SUCCESS) { printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } #if (CUDA_VERSION >= 11030) char* opts[5]; opts[0] = (char*)malloc(sizeof(char) * 50); if (!opts[0]) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } sprintf(opts[0], "--gpu-architecture=sm_%" PRIu64 "%" PRIu64 "", app->configuration.computeCapabilityMajor, app->configuration.computeCapabilityMinor); //result = nvrtcAddNameExpression(prog, "&consts"); //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result)); result = nvrtcCompileProgram(prog, // prog 
1, // numOptions (const char* const*)opts); // options free(opts[0]); #else result = nvrtcCompileProgram(prog, // prog 0, // numOptions 0); // options #endif if (result != NVRTC_SUCCESS) { printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 4000000); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } else { nvrtcGetProgramLog(prog, log); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } #if (CUDA_VERSION >= 11030) result = nvrtcGetCUBINSize(prog, &codeSize); #else result = nvrtcGetPTXSize(prog, &codeSize); #endif if (result != NVRTC_SUCCESS) { #if (CUDA_VERSION >= 11030) printf("nvrtcGetCUBINSize error: %s\n", nvrtcGetErrorString(result)); #else printf("nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result)); #endif free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; } axis->binarySize = codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } axis->binary = code; #if (CUDA_VERSION >= 11030) result = nvrtcGetCUBIN(prog, code); #else result = nvrtcGetPTX(prog, code); #endif if (result != NVRTC_SUCCESS) { #if (CUDA_VERSION >= 11030) printf("nvrtcGetCUBIN error: %s\n", nvrtcGetErrorString(result)); #else printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result)); #endif free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE; } result = nvrtcDestroyProgram(&prog); if (result != NVRTC_SUCCESS) { printf("nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; } } CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); if (result2 != CUDA_SUCCESS) { 
printf("cuModuleLoadDataEx error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; } result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C"); if (result2 != CUDA_SUCCESS) { printf("cuModuleGetFunction error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory); if (result2 != CUDA_SUCCESS) { printf("cuFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; } } if (axis->pushConstants.structSize) { size_t size = axis->pushConstants.structSize; result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); if (result2 != CUDA_SUCCESS) { printf("cuModuleGetGlobal error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; } } if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==2) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { hiprtcProgram prog; enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog code0, // buffer "VkFFT.hip", // name 0, // 
numHeaders 0, // headers 0); // includeNames if (result != HIPRTC_SUCCESS) { printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } if (axis->pushConstants.structSize) { result = hiprtcAddNameExpression(prog, "&consts"); if (result != HIPRTC_SUCCESS) { printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION; } } result = hiprtcCompileProgram(prog, // prog 0, // numOptions 0); // options if (result != HIPRTC_SUCCESS) { printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 100000); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } else { hiprtcGetProgramLog(prog, log); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } result = hiprtcGetCodeSize(prog, &codeSize); if (result != HIPRTC_SUCCESS) { printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE; } axis->binarySize = codeSize; code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } axis->binary = code; result = hiprtcGetCode(prog, (char*)code); if (result != HIPRTC_SUCCESS) { printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; } //printf("%s\n", code); // Destroy the program. 
result = hiprtcDestroyProgram(&prog); if (result != HIPRTC_SUCCESS) { printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; } } hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); if (result2 != hipSuccess) { printf("hipModuleLoadDataEx error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; } result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C"); if (result2 != hipSuccess) { printf("hipModuleGetFunction error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory); //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); if (result2 != hipSuccess) { printf("hipFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; } } if (axis->pushConstants.structSize) { size_t size = axis->pushConstants.structSize; result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); if (result2 != hipSuccess) { printf("hipModuleGetGlobal error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; } } if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==3) if (app->configuration.loadApplicationFromString) { char* code; uint64_t codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; 
memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); const unsigned char* temp_code = (const unsigned char*)code; axis->program = clCreateProgramWithBinary(app->configuration.context[0], 1, app->configuration.device, &codeSize_size_t, (const unsigned char**)(&temp_code), 0, &res); if (res != CL_SUCCESS) { free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(code); code = 0; } else { size_t codelen = strlen(code0); const char* temp_code = (const char*)code0; axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&temp_code, &codelen, &res); if (res != CL_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } } res = clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0); if (res != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size); char* log = (char*)malloc(log_size); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } else { clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } if (app->configuration.saveApplicationToString) { size_t codeSize; res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &codeSize, NULL); if (res != CL_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } axis->binarySize = 
(uint64_t)codeSize; axis->binary = (char*)malloc(axis->binarySize); if (!axis->binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARIES, codeSize, &axis->binary, NULL); if (res != CL_SUCCESS) { if (app->configuration.saveApplicationToString) { free(axis->binary); axis->binary = 0; } free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } axis->kernel = clCreateKernel(axis->program, "VkFFT_main_R2C", &res); if (res != CL_SUCCESS) { if (app->configuration.saveApplicationToString) { free(axis->binary); axis->binary = 0; } free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } #elif(VKFFT_BACKEND==4) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; ze_module_desc_t moduleDesc = { ZE_STRUCTURE_TYPE_MODULE_DESC, 0, ZE_MODULE_FORMAT_NATIVE, codeSize, (uint8_t*)code, pBuildFlags, 0 }; res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); if (res != ZE_RESULT_SUCCESS) { free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(code); code = 0; } else { size_t codelen = strlen(code0); uint64_t successOpen = 0; FILE* temp; char fname_cl[100]; char fname_bc[100]; char fname_spv[100]; int name_id = 0; while (!successOpen) { sprintf(fname_cl, "VkFFT_temp_cl_%d.cl", name_id); temp = fopen(fname_cl, "r"); if (temp != 0) { fclose(temp); name_id++; } else { successOpen = 1; sprintf(fname_bc, "VkFFT_temp_bc_%d.spv", name_id); sprintf(fname_spv, "VkFFT_temp_cl_%d.spv", name_id); } } temp = fopen(fname_cl, "w"); fwrite(code0, 1, codelen, temp); fclose(temp); char system_call[500]; sprintf(system_call, "clang -c -target spir64 -O0 -emit-llvm -o %s %s", fname_bc, fname_cl); system(system_call); sprintf(system_call, "llvm-spirv -o %s %s", fname_spv, fname_bc); system(system_call); temp = fopen(fname_spv, "rb"); fseek(temp, 0L, SEEK_END); uint64_t spv_size = ftell(temp); rewind(temp); uint8_t* spv_binary = (uint8_t*)malloc(spv_size); if (!spv_binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } fread(spv_binary, 1, spv_size, temp); fclose(temp); remove(fname_cl); remove(fname_bc); remove(fname_spv); const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; ze_module_desc_t moduleDesc = { ZE_STRUCTURE_TYPE_MODULE_DESC, 0, ZE_MODULE_FORMAT_IL_SPIRV, spv_size, spv_binary, pBuildFlags, 0 }; res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); if (res != ZE_RESULT_SUCCESS) { free(spv_binary); spv_binary = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(spv_binary); spv_binary = 0; if (app->configuration.saveApplicationToString) { size_t codeSize; res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, 0); if (res != ZE_RESULT_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } axis->binarySize = codeSize; axis->binary = (char*)malloc(axis->binarySize); if (!axis->binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, (uint8_t*)axis->binary); if (res != ZE_RESULT_SUCCESS) { free(axis->binary); axis->binary = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } } ze_kernel_desc_t kernelDesc = { ZE_STRUCTURE_TYPE_KERNEL_DESC, 0, 0, // flags "VkFFT_main_R2C" }; res = zeKernelCreate(axis->VkFFTModule, &kernelDesc, &axis->VkFFTKernel); if (res != ZE_RESULT_SUCCESS) { if (app->configuration.saveApplicationToString) { free(axis->binary); axis->binary = 0; } free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } #elif(VKFFT_BACKEND==5) NS::Error* error; if (app->configuration.loadApplicationFromString) { char* code; uint64_t codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return 
VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); dispatch_data_t data = dispatch_data_create(code, codeSize, 0, 0); axis->library = app->configuration.device->newLibrary(data, &error); free(code); code = 0; } else { size_t codelen = strlen(code0); MTL::CompileOptions* compileOptions = MTL::CompileOptions::alloc(); compileOptions->setFastMathEnabled(true); NS::String* str = NS::String::string(code0, NS::UTF8StringEncoding); axis->library = app->configuration.device->newLibrary(str, compileOptions, &error); if (error) { printf("%s\n%s\n", error->debugDescription()->cString(NS::ASCIIStringEncoding), error->localizedDescription()->cString(NS::ASCIIStringEncoding)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } compileOptions->release(); if (app->configuration.saveApplicationToString) { } str->release(); } const char function_name[20] = "VkFFT_main_R2C"; NS::String* str = NS::String::string(function_name, NS::UTF8StringEncoding); MTL::Function* function = axis->library->newFunction(str); axis->pipeline = app->configuration.device->newComputePipelineState(function, &error); function->release(); str->release(); #endif if (!app->configuration.keepShaderCode) { free(code0); code0 = 0; axis->specializationConstants.code0 = 0; } } return resFFT; } static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t reverseBluesteinMultiUpload) { //get radix stages VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) VkResult res = VK_SUCCESS; #elif(VKFFT_BACKEND==1) cudaError_t res = cudaSuccess; #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; #elif(VKFFT_BACKEND==3) cl_int res = CL_SUCCESS; #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; #elif(VKFFT_BACKEND==5) #endif VkFFTAxis* axis = (reverseBluesteinMultiUpload) ? 
&FFTPlan->inverseBluesteinAxes[axis_id][axis_upload_id] : &FFTPlan->axes[axis_id][axis_upload_id]; axis->specializationConstants.sourceFFTSize = app->configuration.size[axis_id]; if ((app->configuration.FFTdim == 1) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] == 1) && ((app->configuration.numberBatches > 1) || (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) { if (app->configuration.numberBatches > 1) { app->actualNumBatches = app->configuration.numberBatches; app->configuration.numberBatches = 1; } FFTPlan->actualFFTSizePerAxis[axis_id][1] = app->actualNumBatches; } axis->specializationConstants.numBatches = app->configuration.numberBatches; axis->specializationConstants.warpSize = app->configuration.warpSize; axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks; axis->specializationConstants.useUint64 = app->configuration.useUint64; #if(VKFFT_BACKEND==2) axis->specializationConstants.useStrict32BitAddress = app->configuration.useStrict32BitAddress; #endif axis->specializationConstants.disableSetLocale = app->configuration.disableSetLocale; axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[axis_id]; axis->specializationConstants.fixMinRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; axis->specializationConstants.fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; axis->specializationConstants.fixMinRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; axis->specializationConstants.fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; axis->specializationConstants.raderUintLUT = (axis->specializationConstants.useRader) ? app->configuration.useRaderUintLUT : 0; axis->specializationConstants.inline_rader_g_pow = (axis->specializationConstants.raderUintLUT) ? 2 : 1; axis->specializationConstants.inline_rader_kernel = (app->configuration.useLUT == 1) ? 
0 : 1; uint64_t complexSize; if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { axis->specializationConstants.precision = 1; complexSize = (2 * sizeof(double)); } else { if (app->configuration.halfPrecision) { axis->specializationConstants.precision = 0; complexSize = (2 * sizeof(float)); } else { axis->specializationConstants.precision = 0; complexSize = (2 * sizeof(float)); } } axis->specializationConstants.complexSize = complexSize; axis->specializationConstants.supportAxis = 0; axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel; axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution; axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization; uint64_t allowedSharedMemory = app->configuration.sharedMemorySize; uint64_t allowedSharedMemoryPow2 = app->configuration.sharedMemorySizePow2; if (axis->specializationConstants.useRaderMult) { allowedSharedMemory -= (axis->specializationConstants.useRaderMult - 1) * complexSize; allowedSharedMemoryPow2 -= (axis->specializationConstants.useRaderMult - 1) * complexSize; } uint64_t maxSequenceLengthSharedMemory = allowedSharedMemory / complexSize; uint64_t maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / complexSize; uint64_t maxSingleSizeStrided = (app->configuration.coalescedMemory > complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / complexSize; uint64_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > complexSize) ? 
allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / complexSize; axis->specializationConstants.stageStartSize = 1; for (uint64_t i = 0; i < axis_upload_id; i++) axis->specializationConstants.stageStartSize *= FFTPlan->axisSplit[axis_id][i]; axis->specializationConstants.firstStageStartSize = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / FFTPlan->axisSplit[axis_id][FFTPlan->numAxisUploads[axis_id] - 1]; axis->specializationConstants.dispatchZactualFFTSize = (axis_id < 2) ? FFTPlan->actualFFTSizePerAxis[axis_id][2] : FFTPlan->actualFFTSizePerAxis[axis_id][1]; if (axis_id == 0) { //configure radix stages axis->specializationConstants.fft_dim_x = axis->specializationConstants.stageStartSize; } else { axis->specializationConstants.fft_dim_x = FFTPlan->actualFFTSizePerAxis[axis_id][0]; } if (app->useBluesteinFFT[axis_id]) { axis->specializationConstants.useBluesteinFFT = 1; } if (app->configuration.performDCT == 3) { axis->specializationConstants.actualInverse = inverse; axis->specializationConstants.inverse = !inverse; } else { if (app->configuration.performDCT == 4) { axis->specializationConstants.actualInverse = inverse; axis->specializationConstants.inverse = 1; } else { axis->specializationConstants.actualInverse = inverse; axis->specializationConstants.inverse = inverse; } } if (app->useBluesteinFFT[axis_id]) { axis->specializationConstants.actualInverse = inverse; axis->specializationConstants.inverse = reverseBluesteinMultiUpload; if (app->configuration.performDCT == 3) { axis->specializationConstants.inverseBluestein = !inverse; } else { if (app->configuration.performDCT == 4) { axis->specializationConstants.inverseBluestein = 1; } else { axis->specializationConstants.inverseBluestein = inverse; } } } axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload; axis->specializationConstants.reorderFourStep = ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->useBluesteinFFT[axis_id])) ? 
app->configuration.reorderFourStep : 0; if ((axis_id == 0) && ((FFTPlan->numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) { maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost; maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory)); } else { maxSingleSizeStrided *= axis->specializationConstants.registerBoost; maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided)); } axis->specializationConstants.performR2C = FFTPlan->actualPerformR2CPerAxis[axis_id]; axis->specializationConstants.performR2CmultiUpload = FFTPlan->multiUploadR2C; if (app->configuration.performDCT == 3) { axis->specializationConstants.performDCT = 2; } else { axis->specializationConstants.performDCT = app->configuration.performDCT; } if ((axis->specializationConstants.performR2CmultiUpload) && (app->configuration.size[0] % 2 != 0)) return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C; uint64_t additionalR2Cshared = 0; if ((axis->specializationConstants.performR2C || ((axis->specializationConstants.performDCT == 2) || ((axis->specializationConstants.performDCT == 4) && ((axis->specializationConstants.fftDim % 2) != 0)))) && (axis->specializationConstants.axis_id == 0) && (!axis->specializationConstants.performR2CmultiUpload)) { additionalR2Cshared = ((axis->specializationConstants.fftDim % 2) == 0) ? 
2 : 1; if ((axis->specializationConstants.performDCT == 2) || ((axis->specializationConstants.performDCT == 4) && ((axis->specializationConstants.fftDim % 2) != 0))) additionalR2Cshared = 1; } axis->specializationConstants.mergeSequencesR2C = (((axis->specializationConstants.fftDim + additionalR2Cshared) <= maxSequenceLengthSharedMemory) && ((FFTPlan->actualFFTSizePerAxis[axis_id][1] % 2) == 0) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || (((app->configuration.performDCT == 3) || (app->configuration.performDCT == 2) || (app->configuration.performDCT == 1) || ((app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0)))) ? (1 - app->configuration.disableMergeSequencesR2C) : 0; //uint64_t passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id; axis->specializationConstants.fft_dim_full = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; if ((FFTPlan->numAxisUploads[axis_id] > 1) && (axis->specializationConstants.reorderFourStep || app->useBluesteinFFT[axis_id]) && (!app->configuration.userTempBuffer) && (app->configuration.allocateTempBuffer == 0)) { app->configuration.allocateTempBuffer = 1; #if(VKFFT_BACKEND==0) app->configuration.tempBuffer = (VkBuffer*)malloc(sizeof(VkBuffer)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } resFFT = allocateFFTBuffer(app, app->configuration.tempBuffer, &app->configuration.tempBufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, app->configuration.tempBufferSize[0]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } #elif(VKFFT_BACKEND==1) app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = cudaMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]); if (res != cudaSuccess) { 
deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==2) app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = hipMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==3) app->configuration.tempBuffer = (cl_mem*)malloc(sizeof(cl_mem)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } app->configuration.tempBuffer[0] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, app->configuration.tempBufferSize[0], 0, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, app->configuration.tempBufferSize[0], sizeof(float), app->configuration.device[0], app->configuration.tempBuffer); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==5) app->configuration.tempBuffer = (MTL::Buffer**)malloc(sizeof(MTL::Buffer*)); if (!app->configuration.tempBuffer) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } app->configuration.tempBuffer[0] = app->configuration.device->newBuffer(app->configuration.tempBufferSize[0], MTL::ResourceStorageModePrivate); #endif } //generate Rader Kernels resFFT = VkFFTGenerateRaderFFTKernel(app, axis); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } //allocate LUT if (app->configuration.useLUT == 1) { uint64_t dimMult = 1; uint64_t maxStageSum = 0; for (uint64_t i = 0; i < 
axis->specializationConstants.numStages; i++) { if (i > 0) { switch (axis->specializationConstants.stageRadix[i]) { case 2: maxStageSum += dimMult; break; case 3: maxStageSum += dimMult * 2; break; case 4: maxStageSum += dimMult * 2; break; case 5: maxStageSum += dimMult * 4; break; case 6: maxStageSum += dimMult * 5; break; case 7: maxStageSum += dimMult * 6; break; case 8: maxStageSum += dimMult * 3; break; case 9: maxStageSum += dimMult * 8; break; case 10: maxStageSum += dimMult * 9; break; case 11: maxStageSum += dimMult * 10; break; case 12: maxStageSum += dimMult * 11; break; case 13: maxStageSum += dimMult * 12; break; case 14: maxStageSum += dimMult * 13; break; case 15: maxStageSum += dimMult * 14; break; case 16: maxStageSum += dimMult * 4; break; case 32: maxStageSum += dimMult * 5; break; default: maxStageSum += dimMult * (axis->specializationConstants.stageRadix[i]); break; } } dimMult *= axis->specializationConstants.stageRadix[i]; } axis->specializationConstants.maxStageSumLUT = maxStageSum; dimMult = 1; for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) { if (axis->specializationConstants.raderContainer[k].type == 0) { axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUT = maxStageSum; for (uint64_t i = 0; i < axis->specializationConstants.raderContainer[k].numStages; i++) { if (i > 0) { switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) { case 2: maxStageSum += dimMult; break; case 3: maxStageSum += dimMult * 2; break; case 4: maxStageSum += dimMult * 2; break; case 5: maxStageSum += dimMult * 4; break; case 6: maxStageSum += dimMult * 5; break; case 7: maxStageSum += dimMult * 6; break; case 8: maxStageSum += dimMult * 3; break; case 9: maxStageSum += dimMult * 8; break; case 10: maxStageSum += dimMult * 9; break; case 11: maxStageSum += dimMult * 10; break; case 12: maxStageSum += dimMult * 11; break; case 13: maxStageSum += dimMult * 12; break; case 14: maxStageSum += dimMult * 13; 
break; case 15: maxStageSum += dimMult * 14; break; case 16: maxStageSum += dimMult * 4; break; case 32: maxStageSum += dimMult * 5; break; default: maxStageSum += dimMult * (axis->specializationConstants.raderContainer[k].stageRadix[i]); break; } } dimMult *= axis->specializationConstants.raderContainer[k].stageRadix[i]; } axis->specializationConstants.maxStageSumLUT = maxStageSum; dimMult = 1; } } //iFFT LUT dimMult = 1; for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) { if (axis->specializationConstants.raderContainer[k].type == 0) { axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUTiFFT = maxStageSum; for (int64_t i = axis->specializationConstants.raderContainer[k].numStages - 1; i >= 0; i--) { if (i < (int64_t)axis->specializationConstants.raderContainer[k].numStages - 1) { switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) { case 2: maxStageSum += dimMult; break; case 3: maxStageSum += dimMult * 2; break; case 4: maxStageSum += dimMult * 2; break; case 5: maxStageSum += dimMult * 4; break; case 6: maxStageSum += dimMult * 5; break; case 7: maxStageSum += dimMult * 6; break; case 8: maxStageSum += dimMult * 3; break; case 9: maxStageSum += dimMult * 8; break; case 10: maxStageSum += dimMult * 9; break; case 11: maxStageSum += dimMult * 10; break; case 12: maxStageSum += dimMult * 11; break; case 13: maxStageSum += dimMult * 12; break; case 14: maxStageSum += dimMult * 13; break; case 15: maxStageSum += dimMult * 14; break; case 16: maxStageSum += dimMult * 4; break; case 32: maxStageSum += dimMult * 5; break; default: maxStageSum += dimMult * (axis->specializationConstants.raderContainer[k].stageRadix[i]); break; } } dimMult *= axis->specializationConstants.raderContainer[k].stageRadix[i]; } axis->specializationConstants.maxStageSumLUT = maxStageSum; dimMult = 1; } } if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { long double double_PI = 
3.14159265358979323846264338327950288419716939937510L; if (axis_upload_id > 0) { if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim; axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double); } else { if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim; axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(double); } else axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(double); } if (app->configuration.useLUT_4step == 1) axis->bufferLUTSize += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim * 2 * sizeof(double); } else { if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double); } else { if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + 
app->configuration.size[axis_id] / 2) * 2 * sizeof(double); } else axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(double); } } if (axis->specializationConstants.useRader) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (!axis->specializationConstants.inline_rader_kernel) { axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(double)); axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(double); } } } if (axis->bufferLUTSize == 0) axis->bufferLUTSize = sizeof(double); double* tempLUT = (double*)malloc(axis->bufferLUTSize); if (!tempLUT) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; uint64_t localStageSum = 0; for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else if (axis->specializationConstants.rader_generator[i] > 0) { for (uint64_t j = 0; j < localStageSize; j++) { for (int64_t k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { tempLUT[2 * (k + localStageSum)] = (double)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); tempLUT[2 * (k + localStageSum) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += (axis->specializationConstants.stageRadix[i]); } } else { for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; 
k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum)] = (double)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.stageRadix[i]; } if (axis->specializationConstants.useRader) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type) { if (!axis->specializationConstants.inline_rader_kernel) { for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } } else { localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; localStageSum = 0; for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + 
axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (double)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; } localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; localStageSum = 0; for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } 
else { for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (double)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (double)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; } if (!axis->specializationConstants.inline_rader_kernel) { double* raderFFTkernel = (double*)axis->specializationConstants.raderContainer[i].raderFFTkernel; for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (double)(raderFFTkernel[2 * j] / (long double)(axis->specializationConstants.raderContainer[i].prime - 1)); tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (double)(raderFFTkernel[2 * j + 1] / (long double)(axis->specializationConstants.raderContainer[i].prime - 1)); } } } } } if ((axis_upload_id > 0) && (app->configuration.useLUT_4step == 1)) { for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) { for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) { long double angle = 2 * double_PI * ((i * j) / (long double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim)); tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (double)cos(angle); tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (double)sin(angle); } 
} } if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) { long double angle = (double_PI / 2.0 / (long double)(app->configuration.size[axis_id])) * j; tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (double)cos(angle); tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (double)sin(angle); } } if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; j++) { long double angle = (double_PI / 2.0 / (long double)(app->configuration.size[axis_id] / 2)) * j; tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (double)cos(angle); tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (double)sin(angle); } for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) { long double angle = (-double_PI / 8.0 / (long double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1); tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (double)cos(angle); tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (double)sin(angle); } } axis->referenceLUT = 0; if (reverseBluesteinMultiUpload == 1) { axis->bufferLUT = FFTPlan->axes[axis_id][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize; axis->referenceLUT = 
1; } else { uint64_t checkRadixOrder = 1; for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[0][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } if (checkRadixOrder && ((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) { axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { checkRadixOrder = 1; for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[1][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if 
(FFTPlan->axes[1][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } if (checkRadixOrder && (axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) { axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { #if(VKFFT_BACKEND==0) resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==1) res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != cudaSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==2) res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != hipSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return 
resFFT; } #elif(VKFFT_BACKEND==3) axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==5) axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #endif } } } } free(tempLUT); tempLUT = 0; } else { double double_PI = 3.14159265358979323846264338327950288419716939937510; if (axis_upload_id > 0) { if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); if (app->configuration.useLUT_4step == 1) axis->specializationConstants.startDCT3LUT += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim; axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(float); } else { if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); if (app->configuration.useLUT_4step == 1) 
axis->specializationConstants.startDCT3LUT += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim; axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (axis->specializationConstants.fftDim / 4 + 2)); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float); } else axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(float); } if (app->configuration.useLUT_4step == 1) axis->bufferLUTSize += axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim * 2 * sizeof(float); } else { if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(float); } else { if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { axis->specializationConstants.startDCT3LUT = (maxStageSum); axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float); } else axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(float); } } if (axis->specializationConstants.useRader) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (!axis->specializationConstants.inline_rader_kernel) { axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(float)); axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(float); } } } if (axis->bufferLUTSize == 0) axis->bufferLUTSize = sizeof(float); float* tempLUT = (float*)malloc(axis->bufferLUTSize); if (!tempLUT) { deleteVkFFT(app); return 
VKFFT_ERROR_MALLOC_FAILED; } uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; uint64_t localStageSum = 0; for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else if (axis->specializationConstants.rader_generator[i] > 0) { for (uint64_t j = 0; j < localStageSize; j++) { for (int64_t k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { tempLUT[2 * (k + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); tempLUT[2 * (k + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += (axis->specializationConstants.stageRadix[i]); } } else { for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.stageRadix[i]; } if (axis->specializationConstants.useRader) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type) { if (!axis->specializationConstants.inline_rader_kernel) { for (uint64_t j = 0; j < 
(axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later uint64_t g_pow = 1; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; } tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); } } } else { localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; localStageSum = 0; for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum + 
axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; } localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; localStageSum = 0; for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); } localStageSum += localStageSize; } } else { for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { for (uint64_t j = 0; j < localStageSize; j++) { tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); } localStageSum += localStageSize; } } localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; } if 
(!axis->specializationConstants.inline_rader_kernel) { float* raderFFTkernel = (float*)axis->specializationConstants.raderContainer[i].raderFFTkernel; for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(raderFFTkernel[2 * j] / (axis->specializationConstants.raderContainer[i].prime - 1)); tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(raderFFTkernel[2 * j + 1] / (axis->specializationConstants.raderContainer[i].prime - 1)); } } } } } if ((axis_upload_id > 0) && (app->configuration.useLUT_4step == 1)) { for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) { for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) { double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim)); tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (float)cos(angle); tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (float)sin(angle); } } } if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) { double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id])) * j; tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle); tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle); } } if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; j++) { double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id] / 2)) * j; tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle); tempLUT[2 * 
axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle); } for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) { double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1); tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (float)cos(angle); tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)sin(angle); } } axis->referenceLUT = 0; if (reverseBluesteinMultiUpload == 1) { axis->bufferLUT = FFTPlan->axes[axis_id][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { uint64_t checkRadixOrder = 1; for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[0][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } if (checkRadixOrder && ((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && 
(FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) { axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { checkRadixOrder = 1; for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) if (FFTPlan->axes[1][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; if (checkRadixOrder) { for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 0) { for (uint64_t k = 0; k < axis->specializationConstants.raderContainer[i].numStages; k++) { if (FFTPlan->axes[1][0].specializationConstants.raderContainer[i].stageRadix[k] != axis->specializationConstants.raderContainer[i].stageRadix[k]) checkRadixOrder = 0; } } } } if (checkRadixOrder && (axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) { axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT; #if(VKFFT_BACKEND==0) axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory; #endif axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize; axis->referenceLUT = 1; } else { #if(VKFFT_BACKEND==0) resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==1) res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != cudaSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==2) res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); if (res != hipSuccess) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==3) axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != 
VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==5) axis->bufferLUT = app->configuration.device->newBuffer(axis->bufferLUTSize, MTL::ResourceStorageModePrivate); resFFT = VkFFT_transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempLUT); tempLUT = 0; return resFFT; } #endif } } } } free(tempLUT); tempLUT = 0; } } if (axis->specializationConstants.useRaderMult) axis->specializationConstants.additionalRaderSharedSize = (axis->specializationConstants.useRaderMult - 1); //allocate RaderUintLUT if (axis->specializationConstants.raderUintLUT) { if (app->bufferRaderUintLUT[axis_id][axis_upload_id] == 0) { app->bufferRaderUintLUTSize[axis_id][axis_upload_id] = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { app->bufferRaderUintLUTSize[axis_id][axis_upload_id] += axis->specializationConstants.raderContainer[i].prime * sizeof(uint32_t); } uint32_t* tempRaderUintLUT = (uint32_t*)malloc(app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (!tempRaderUintLUT) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } uint64_t current_offset = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].prime > 0) { axis->specializationConstants.raderContainer[i].raderUintLUToffset = current_offset; uint64_t g_pow = 1; tempRaderUintLUT[current_offset] = 1; current_offset++; for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1; t++) { g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; tempRaderUintLUT[current_offset] = (uint32_t)g_pow; current_offset++; } } } #if(VKFFT_BACKEND==0) resFFT = allocateFFTBuffer(app, &app->bufferRaderUintLUT[axis_id][axis_upload_id], &app->bufferRaderUintLUTDeviceMemory[axis_id][axis_upload_id], 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } resFFT = VkFFT_transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==1) res = cudaMalloc((void**)&app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (res != cudaSuccess) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==2) res = hipMalloc((void**)&app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (res != hipSuccess) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==3) app->bufferRaderUintLUT[axis_id][axis_upload_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], tempRaderUintLUT, &res); if (res != CL_SUCCESS) { deleteVkFFT(app); 
free(tempRaderUintLUT); tempRaderUintLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } #elif(VKFFT_BACKEND==4) ze_device_mem_alloc_desc_t device_desc = {}; device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; res = zeMemAllocDevice(app->configuration.context[0], &device_desc, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], sizeof(uint32_t), app->configuration.device[0], &app->bufferRaderUintLUT[axis_id][axis_upload_id]); if (res != ZE_RESULT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return VKFFT_ERROR_FAILED_TO_ALLOCATE; } resFFT = VkFFT_transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } #elif(VKFFT_BACKEND==5) app->bufferRaderUintLUT[axis_id][axis_upload_id] = app->configuration.device->newBuffer(app->bufferRaderUintLUTSize[axis_id][axis_upload_id], MTL::ResourceStorageModePrivate); resFFT = VkFFT_transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); free(tempRaderUintLUT); tempRaderUintLUT = 0; return resFFT; } #endif free(tempRaderUintLUT); tempRaderUintLUT = 0; } else { uint64_t current_offset = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].prime > 0) { axis->specializationConstants.raderContainer[i].raderUintLUToffset = current_offset; uint64_t g_pow = 1; current_offset += axis->specializationConstants.raderContainer[i].prime; } } } axis->bufferRaderUintLUT = app->bufferRaderUintLUT[axis_id][axis_upload_id]; #if(VKFFT_BACKEND==0) axis->bufferRaderUintLUTDeviceMemory = app->bufferRaderUintLUTDeviceMemory[axis_id][axis_upload_id]; #endif axis->bufferRaderUintLUTSize = 
app->bufferRaderUintLUTSize[axis_id][axis_upload_id]; } //configure strides uint64_t* axisStride = axis->specializationConstants.inputStride; uint64_t* usedStride = app->configuration.bufferStride; if ((!inverse) && (axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride; if ((inverse) && (axis_id == app->lastAxis) && ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0)) || (!app->useBluesteinFFT[axis_id])) && (!app->configuration.performConvolution)) && (app->configuration.isInputFormatted) && (!app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride; axisStride[0] = 1; if (axis_id == 0) { axisStride[1] = usedStride[0]; axisStride[2] = usedStride[1]; } if (axis_id == 1) { axisStride[1] = usedStride[0]; axisStride[2] = usedStride[1]; } if (axis_id == 2) { axisStride[1] = usedStride[1]; axisStride[2] = usedStride[0]; } axisStride[3] = usedStride[2]; axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)))) { axisStride[0] = 1; if (axis_id == 0) { axisStride[1] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[1]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[1]; axisStride[2] = (axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[2]) ? axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[2]; } if (axis_id == 1) { axisStride[1] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[1]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[1]; axisStride[2] = (axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[2]) ? 
axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[2]; } if (axis_id == 2) { axisStride[2] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[2]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[2]; axisStride[1] = (axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[1]) ? axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[1]; } axisStride[3] = (axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2] > axisStride[3]) ? axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2] : axisStride[3]; axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; } if ((!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0) && (axis->specializationConstants.performR2C || FFTPlan->multiUploadR2C) && (!(app->configuration.isInputFormatted))) { axisStride[1] *= 2; axisStride[2] *= 2; axisStride[3] *= 2; axisStride[4] *= 2; } if ((FFTPlan->multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) { for (uint64_t i = 1; i < 5; i++) { axisStride[i] /= 2; } } axisStride = axis->specializationConstants.outputStride; usedStride = app->configuration.bufferStride; if ((!inverse) && (axis_id == app->lastAxis) && (axis_upload_id == 0) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride; if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (!app->configuration.performConvolution)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (app->configuration.performConvolution)))) && ((app->configuration.isOutputFormatted))) usedStride = app->configuration.outputBufferStride; if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (app->configuration.isInputFormatted)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] 
- 1) && (!axis->specializationConstants.reorderFourStep))) && (app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride; axisStride[0] = 1; if (axis_id == 0) { axisStride[1] = usedStride[0]; axisStride[2] = usedStride[1]; } if (axis_id == 1) { axisStride[1] = usedStride[0]; axisStride[2] = usedStride[1]; } if (axis_id == 2) { axisStride[1] = usedStride[1]; axisStride[2] = usedStride[0]; } axisStride[3] = usedStride[2]; axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 1)))) { axisStride[0] = 1; if (axis_id == 0) { axisStride[1] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[1]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[1]; axisStride[2] = (axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[2]) ? axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[2]; } if (axis_id == 1) { axisStride[1] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[1]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[1]; axisStride[2] = (axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[2]) ? axisStride[1] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[2]; } if (axis_id == 2) { axisStride[2] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axisStride[2]) ? FFTPlan->actualFFTSizePerAxis[axis_id][0] : axisStride[2]; axisStride[1] = (axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][1] > axisStride[1]) ? axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][1] : axisStride[1]; } axisStride[3] = (axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2] > axisStride[3]) ? 
axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2] : axisStride[3]; axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; } if ((inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))) && (axis->specializationConstants.performR2C || FFTPlan->multiUploadR2C) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) { axisStride[1] *= 2; axisStride[2] *= 2; axisStride[3] *= 2; axisStride[4] *= 2; } if ((FFTPlan->multiUploadR2C) && (inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))))) { for (uint64_t i = 1; i < 5; i++) { axisStride[i] /= 2; } } /*axis->specializationConstants.inputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.inputStride[3]; axis->specializationConstants.outputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.outputStride[3]; axis->specializationConstants.inputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.inputStride[3] * app->configuration.coordinateFeatures; axis->specializationConstants.outputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 
0 : axis->specializationConstants.outputStride[3] * app->configuration.coordinateFeatures; */ uint64_t storageComplexSize; if (app->configuration.doublePrecision) storageComplexSize = (2 * sizeof(double)); else if (app->configuration.halfPrecision) storageComplexSize = (2 * 2); else storageComplexSize = (2 * sizeof(float)); uint64_t initPageSize = -1; uint64_t locBufferNum = 1; uint64_t locBufferSize = -1; /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { initPageSize += app->configuration.bufferSize[i]; }*/ /*if (app->configuration.performConvolution) { uint64_t initPageSizeKernel = 0; for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { initPageSizeKernel += app->configuration.kernelSize[i]; } if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel; } if (axis_id == 0) { if ((!((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { initPageSize = app->configuration.localPageSize * 1024; } } if (axis_id == 1) { if ((app->configuration.bufferStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { initPageSize = app->configuration.localPageSize * 1024; } } if (axis_id == 2) { if ((app->configuration.bufferStride[2] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { initPageSize = app->configuration.localPageSize * 1024; } } */ if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( ((axis_id == app->firstAxis) && (!inverse)) || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && 
(!app->configuration.inverseReturnToInputBuffer))) ) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.inputBufferNum; if (app->configuration.inputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { totalSize += app->configuration.inputBufferSize[i]; if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; } else { if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) { locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } } else { locBufferNum = app->configuration.tempBufferNum; if (app->configuration.tempBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { totalSize += app->configuration.tempBufferSize[i]; if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; } } } } else { locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } } axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; } } initPageSize = -1; locBufferNum = 1; locBufferSize = -1; if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) )) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( ((axis_id == app->firstAxis) && (inverse)) || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) )) || ((app->configuration.numberKernels > 1) && ( (inverse) || (axis_id == app->lastAxis))) ) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.outputBufferNum; if (app->configuration.outputBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { totalSize += app->configuration.outputBufferSize[i]; if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } else { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { locBufferNum = app->configuration.tempBufferNum; if (app->configuration.tempBufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { totalSize += app->configuration.tempBufferSize[i]; if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; } } } else { locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } } } else { locBufferNum = app->configuration.bufferNum; if (app->configuration.bufferSize) { locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { totalSize += app->configuration.bufferSize[i]; if 
(app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; } } } axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; } if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1; if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; if (app->configuration.performConvolution) { uint64_t totalSize = 0; uint64_t locPageSize = initPageSize; locBufferNum = app->configuration.kernelNum; if (app->configuration.kernelSize) { locBufferSize = (uint64_t)ceil(app->configuration.kernelSize[0] / (double)storageComplexSize); for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { totalSize += app->configuration.kernelSize[i]; if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; } } axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize)); //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; } else { axis->specializationConstants.kernelBlockSize = 0; axis->specializationConstants.kernelBlockNum = 0; } axis->numBindings = 2; axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum; axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum; axis->specializationConstants.numBuffersBound[2] = 0; axis->specializationConstants.numBuffersBound[3] = 0; #if(VKFFT_BACKEND==0) VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum); #endif axis->specializationConstants.convolutionBindingID = -1; if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) { axis->specializationConstants.convolutionBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; #endif axis->numBindings++; } if ((axis_id == 1) && (axis_upload_id == 0) && (app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) { axis->specializationConstants.convolutionBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount += 
(uint32_t)axis->specializationConstants.kernelBlockNum; #endif axis->numBindings++; } if ((axis_id == 2) && (axis_upload_id == 0) && (app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) { axis->specializationConstants.convolutionBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; #endif axis->numBindings++; } if (app->configuration.useLUT == 1) { axis->specializationConstants.LUTBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount++; #endif axis->numBindings++; } if (axis->specializationConstants.raderUintLUT) { axis->specializationConstants.RaderUintLUTBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount++; #endif axis->numBindings++; } if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) { if (axis->specializationConstants.inverseBluestein) axis->bufferBluesteinFFT = &app->bufferBluesteinIFFT[axis_id]; else axis->bufferBluesteinFFT = &app->bufferBluesteinFFT[axis_id]; axis->specializationConstants.BluesteinConvolutionBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount++; #endif axis->numBindings++; } if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) { axis->bufferBluestein = &app->bufferBluestein[axis_id]; axis->specializationConstants.BluesteinMultiplicationBindingID = axis->numBindings; axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; #if(VKFFT_BACKEND==0) descriptorPoolSize.descriptorCount++; #endif axis->numBindings++; } #if(VKFFT_BACKEND==0) 
VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; descriptorPoolCreateInfo.poolSizeCount = 1; descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize; descriptorPoolCreateInfo.maxSets = 1; res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL; } const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings; descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding)); if (!descriptorSetLayoutBindings) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < axis->numBindings; ++i) { descriptorSetLayoutBindings[i].binding = (uint32_t)i; descriptorSetLayoutBindings[i].descriptorType = descriptorType; descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; } VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings; descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings; res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT; } free(descriptorSetLayoutBindings); descriptorSetLayoutBindings = 0; VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool; descriptorSetAllocateInfo.descriptorSetCount = 1; descriptorSetAllocateInfo.pSetLayouts = 
&axis->descriptorSetLayout; res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS; } #endif if (app->configuration.specifyOffsetsAtLaunch) { axis->specializationConstants.performPostCompilationInputOffset = 1; axis->specializationConstants.performPostCompilationOutputOffset = 1; if (app->configuration.performConvolution) axis->specializationConstants.performPostCompilationKernelOffset = 1; } resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } resFFT = VkFFTUpdateBufferSet(app, FFTPlan, axis, axis_id, axis_upload_id, inverse); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } { uint64_t maxBatchCoalesced = app->configuration.coalescedMemory / complexSize; axis->groupedBatch = maxBatchCoalesced; /*if ((FFTPlan->actualFFTSizePerAxis[axis_id][0] < 4096) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] < 512) && (FFTPlan->actualFFTSizePerAxis[axis_id][2] == 1)) { if (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) { if (1024 / axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim) { if (1024 / axis->specializationConstants.fftDim > axis->groupedBatch) axis->groupedBatch = 1024 / axis->specializationConstants.fftDim; else axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim; } } } else { axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? 
maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; }*/ //if (axis->groupedBatch * (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0) < app->configuration.warpSize) axis->groupedBatch = app->configuration.warpSize / (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0); //axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; if (((FFTPlan->numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) { axis->groupedBatch = (maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; } else { axis->groupedBatch = (maxSingleSizeStrided / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStrided / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch; } //axis->groupedBatch = 8; //shared memory bank conflict resolve //#if(VKFFT_BACKEND!=2)//for some reason, hip doesn't get performance increase from having variable shared memory strides. 
if (app->configuration.vendorID == 0x10DE) { if (FFTPlan->numAxisUploads[axis_id] == 2) { if ((axis_upload_id > 0) || (axis->specializationConstants.fftDim <= 512)) { if (axis->specializationConstants.fftDim * (64 / complexSize) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 64 / complexSize; maxBatchCoalesced = 64 / complexSize; } if (axis->specializationConstants.fftDim * (128 / complexSize) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 128 / complexSize; maxBatchCoalesced = 128 / complexSize; } } } //#endif if (FFTPlan->numAxisUploads[axis_id] == 3) { if (axis->specializationConstants.fftDim * (64 / complexSize) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 64 / complexSize; maxBatchCoalesced = 64 / complexSize; } if (axis->specializationConstants.fftDim * (128 / complexSize) <= maxSequenceLengthSharedMemory) { axis->groupedBatch = 128 / complexSize; maxBatchCoalesced = 128 / complexSize; } } } else { if ((FFTPlan->numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) { axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); } //#endif if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) { axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); } } if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced; axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced; //half bandiwdth technique if (!((axis_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && (axis->specializationConstants.fftDim > maxSingleSizeStrided)) { axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim; if (axis->groupedBatch == 0) axis->groupedBatch = 1; } if 
((app->configuration.halfThreads) && (axis->groupedBatch * axis->specializationConstants.fftDim * complexSize >= app->configuration.sharedMemorySize)) axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize; if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced); if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (4 * maxBatchCoalesced); //uint64_t maxThreadNum = (axis_id) ? (maxSingleSizeStrided * app->configuration.coalescedMemory / complexSize) / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost) : maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost); //if (maxThreadNum > app->configuration.maxThreadsNum) maxThreadNum = app->configuration.maxThreadsNum; uint64_t maxThreadNum = app->configuration.maxThreadsNum; axis->specializationConstants.axisSwapped = 0; uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; if (axis_id == 0) { if (axis_upload_id == 0) { axis->axisBlock[0] = (((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { uint64_t locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 
1 : maxBatchCoalesced; uint64_t final_rader_thread_count = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } } axis->axisBlock[0] = final_rader_thread_count; if (axis->axisBlock[0] * axis->groupedBatch > maxThreadNum) axis->groupedBatch = locMaxBatchCoalesced; } if (axis->specializationConstants.useRaderFFT) { if (axis->axisBlock[0] < 
axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[0] = axis->specializationConstants.minRaderFFTThreadNum; } if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum; if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->specializationConstants.reorderFourStep && (FFTPlan->numAxisUploads[axis_id] > 1)) axis->axisBlock[1] = axis->groupedBatch; else { //axis->axisBlock[1] = (axis->axisBlock[0] < app->configuration.warpSize) ? app->configuration.warpSize / axis->axisBlock[0] : 1; uint64_t estimate_batch = (((axis->axisBlock[0] / app->configuration.warpSize) == 1) && ((axis->axisBlock[0] / (double)app->configuration.warpSize) < 1.5)) ? app->configuration.aimThreads / app->configuration.warpSize : app->configuration.aimThreads / axis->axisBlock[0]; if (estimate_batch == 0) estimate_batch = 1; axis->axisBlock[1] = ((axis->axisBlock[0] < app->configuration.aimThreads) && ((axis->axisBlock[0] < app->configuration.warpSize) || (axis->specializationConstants.useRader))) ? 
estimate_batch : 1; } uint64_t currentAxisBlock1 = axis->axisBlock[1]; for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) { if (((FFTPlan->numAxisUploads[0] > 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) % axis->axisBlock[1]) == 0))) || ((FFTPlan->numAxisUploads[0] == 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][1] / r2cmult) % axis->axisBlock[1]) == 0)))) { if (i * axis->specializationConstants.fftDim * complexSize <= allowedSharedMemory) axis->axisBlock[1] = i; i = 2 * currentAxisBlock1; } } if (((axis->specializationConstants.fftDim % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim < maxSequenceLengthSharedMemoryPow2) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { //we plan to swap - this reduces bank conflicts axis->axisBlock[1] = (uint64_t)pow(2, (uint64_t)ceil(log2((double)axis->axisBlock[1]))); } if ((FFTPlan->numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim); if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) { axis->specializationConstants.mergeSequencesR2C = 0; /*if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!(app->configuration.isInputFormatted))) { axis->specializationConstants.inputStride[1] /= 2; axis->specializationConstants.inputStride[2] /= 2; axis->specializationConstants.inputStride[3] /= 2; 
axis->specializationConstants.inputStride[4] /= 2; } if ((inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) { axis->specializationConstants.outputStride[1] /= 2; axis->specializationConstants.outputStride[2] /= 2; axis->specializationConstants.outputStride[3] /= 2; axis->specializationConstants.outputStride[4] /= 2; }*/ r2cmult = 1; } if ((FFTPlan->numAxisUploads[0] == 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult); if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[1] > maxBatchCoalesced)) { axis->axisBlock[1] /= 2; if (axis->axisBlock[1] < maxBatchCoalesced) axis->axisBlock[1] = maxBatchCoalesced; } } if (axis->axisBlock[1] > app->configuration.maxComputeWorkGroupSize[1]) axis->axisBlock[1] = app->configuration.maxComputeWorkGroupSize[1]; //if (axis->axisBlock[0] * axis->axisBlock[1] > app->configuration.maxThreadsNum) axis->axisBlock[1] /= 2; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) { if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum) { axis->axisBlock[1] /= i; i = axis->axisBlock[1] + 1; } } } while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2; if (((axis->specializationConstants.fftDim % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * 
axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { /*#if (VKFFT_BACKEND==0) if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) { uint64_t temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; } #else*/ uint64_t temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 1; //#endif } axis->axisBlock[2] = 1; axis->axisBlock[3] = axis->specializationConstants.fftDim; } else { axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? (uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { uint64_t final_rader_thread_count = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / 
axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } } axis->axisBlock[1] = final_rader_thread_count; if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } if (axis->specializationConstants.useRaderFFT) { if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; } uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; if ((scale > 1) && ((axis->specializationConstants.fftDim * axis->groupedBatch * scale <= maxSequenceLengthSharedMemory))) axis->groupedBatch *= scale; axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? 
axis->groupedBatch : axis->specializationConstants.stageStartSize; if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[0] > maxBatchCoalesced)) { axis->axisBlock[0] /= 2; if (axis->axisBlock[0] < maxBatchCoalesced) axis->axisBlock[0] = maxBatchCoalesced; } } if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) { axis->axisBlock[0] /= i; i = axis->axisBlock[0] + 1; } } } axis->axisBlock[2] = 1; axis->axisBlock[3] = axis->specializationConstants.fftDim; } } if (axis_id == 1) { axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { uint64_t final_rader_thread_count = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } } axis->axisBlock[1] = final_rader_thread_count; if 
(axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } if (axis->specializationConstants.useRaderFFT) { if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; } if (axis->groupedBatch * axis->axisBlock[1] < axis->specializationConstants.warpSize) { axis->groupedBatch = axis->specializationConstants.warpSize / axis->axisBlock[1]; if (axis->groupedBatch == 0) axis->groupedBatch = 1; } axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[0] > maxBatchCoalesced)) { axis->axisBlock[0] /= 2; if (axis->axisBlock[0] < maxBatchCoalesced) axis->axisBlock[0] = maxBatchCoalesced; } } if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) { axis->axisBlock[0] /= i; i = axis->axisBlock[0] + 1; } } } axis->axisBlock[2] = 1; axis->axisBlock[3] = axis->specializationConstants.fftDim; } if (axis_id == 2) { axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; if (axis->specializationConstants.useRaderMult) { uint64_t final_rader_thread_count = 0; for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { if (axis->specializationConstants.raderContainer[i].type == 1) { uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); if (active_rader > 1) { if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; } uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; } } axis->axisBlock[1] = final_rader_thread_count; if 
(axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; } if (axis->specializationConstants.useRaderFFT) { if (axis->axisBlock[1] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[1] = axis->specializationConstants.minRaderFFTThreadNum; } if (axis->groupedBatch * axis->axisBlock[1] < axis->specializationConstants.warpSize) { axis->groupedBatch = axis->specializationConstants.warpSize / axis->axisBlock[1]; if (axis->groupedBatch == 0) axis->groupedBatch = 1; } axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; if (app->configuration.vendorID == 0x10DE) { while ((axis->axisBlock[1] * axis->axisBlock[0] >= 2 * app->configuration.aimThreads) && (axis->axisBlock[0] > maxBatchCoalesced)) { axis->axisBlock[0] /= 2; if (axis->axisBlock[0] < maxBatchCoalesced) axis->axisBlock[0] = maxBatchCoalesced; } } if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) { axis->axisBlock[0] /= i; i = axis->axisBlock[0] + 1; } } } axis->axisBlock[2] = 1; axis->axisBlock[3] = axis->specializationConstants.fftDim; } /*VkSpecializationMapEntry specializationMapEntries[36] = { {} }; for (uint64_t i = 0; i < 36; i++) { specializationMapEntries[i].constantID = i + 1; specializationMapEntries[i].size = sizeof(uint64_t); specializationMapEntries[i].offset = i * sizeof(uint64_t); } VkSpecializationInfo specializationInfo = { 0 }; specializationInfo.dataSize = 36 * sizeof(uint64_t); specializationInfo.mapEntryCount = 36; specializationInfo.pMapEntries = specializationMapEntries;*/ axis->specializationConstants.localSize[0] = axis->axisBlock[0]; 
axis->specializationConstants.localSize[1] = axis->axisBlock[1]; axis->specializationConstants.localSize[2] = axis->axisBlock[2]; axis->specializationConstants.numSubgroups = (uint64_t)ceil(axis->axisBlock[0] * axis->axisBlock[1] * axis->axisBlock[2] / (double)app->configuration.warpSize); //specializationInfo.pData = &axis->specializationConstants; //uint64_t registerBoost = (FFTPlan->numAxisUploads[axis_id] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost; axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 1 : app->configuration.coordinateFeatures; axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution; axis->specializationConstants.numKernels = app->configuration.numberKernels; axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize; axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2; axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 
1 : app->configuration.normalize; axis->specializationConstants.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; axis->specializationConstants.size[1] = FFTPlan->actualFFTSizePerAxis[axis_id][1]; axis->specializationConstants.size[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2]; axis->specializationConstants.axis_id = axis_id; axis->specializationConstants.axis_upload_id = axis_upload_id; for (uint64_t i = 0; i < 3; i++) { axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding; axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i]; axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i]; } if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->numAxisUploads[axis_id] == 1))) { axis->specializationConstants.zeropadBluestein[0] = 1; axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->configuration.size[axis_id]; if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2; if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] - 2; if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2; axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || 
(FFTPlan->numAxisUploads[axis_id] == 1))) { axis->specializationConstants.zeropadBluestein[1] = 1; axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->configuration.size[axis_id]; if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2; if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2; if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2; axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; } uint64_t zeropad_r2c_multiupload_scale = ((axis_id == 0) && (FFTPlan->multiUploadR2C)) ? 2 : 1; if ((inverse)) { if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[0] = 0; if ((!app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!((axis->specializationConstants.useBluesteinFFT) || (app->configuration.performConvolution)))) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)) || (app->configuration.performConvolution)))))) { axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; 
axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[1] = 0; } else { if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[0] = 0; if (((app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!axis->specializationConstants.useBluesteinFFT)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.useBluesteinFFT && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))))) || (((!app->configuration.frequencyZeroPadding) && (app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1) && (app->configuration.performConvolution)))) { axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; } else axis->specializationConstants.zeropad[1] = 0; } if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && 
(app->configuration.performConvolution)) { axis->specializationConstants.convolutionStep = 1; } else axis->specializationConstants.convolutionStep = 0; if (app->useBluesteinFFT[axis_id] && (axis_upload_id == 0)) axis->specializationConstants.BluesteinConvolutionStep = 1; else axis->specializationConstants.BluesteinConvolutionStep = 0; if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) axis->specializationConstants.BluesteinPreMultiplication = 1; else axis->specializationConstants.BluesteinPreMultiplication = 0; if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))) axis->specializationConstants.BluesteinPostMultiplication = 1; else axis->specializationConstants.BluesteinPostMultiplication = 0; uint64_t tempSize[3] = { FFTPlan->actualFFTSizePerAxis[axis_id][0], FFTPlan->actualFFTSizePerAxis[axis_id][1], FFTPlan->actualFFTSizePerAxis[axis_id][2] }; if (axis_id == 0) { if (axis_upload_id == 0) tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[1]; else tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[0]; if ((FFTPlan->actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; //if (app->configuration.performZeropadding[1]) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); } if (axis_id == 1) { tempSize[0] = 
(uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)axis->specializationConstants.fftDim); tempSize[1] = 1; tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2]; tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0); //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); } if (axis_id == 2) { tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][2] / (double)axis->specializationConstants.fftDim); tempSize[1] = 1; tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][1]; tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0); } if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { uint64_t temp_tempSize = tempSize[0]; tempSize[0] = tempSize[1]; tempSize[1] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 1; } else { if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { uint64_t temp_tempSize = tempSize[0]; tempSize[0] = 
tempSize[2]; tempSize[2] = temp_tempSize; axis->specializationConstants.swapComputeWorkGroupID = 2; } } if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1; else axis->specializationConstants.performWorkGroupShift[0] = 0; if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1; else axis->specializationConstants.performWorkGroupShift[1] = 0; if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1; else axis->specializationConstants.performWorkGroupShift[2] = 0; char floatTypeInputMemory[10]; char floatTypeOutputMemory[10]; char floatTypeKernelMemory[10]; char floatType[10]; axis->specializationConstants.unroll = 1; axis->specializationConstants.LUT = (app->configuration.useLUT == 1) ? 1 : 0; axis->specializationConstants.LUT_4step = (app->configuration.useLUT_4step == 1) ? 1 : 0; if (app->configuration.doublePrecision) { sprintf(floatType, "double"); sprintf(floatTypeInputMemory, "double"); sprintf(floatTypeOutputMemory, "double"); sprintf(floatTypeKernelMemory, "double"); //axis->specializationConstants.unroll = 1; } else { //axis->specializationConstants.unroll = 0; if (app->configuration.halfPrecision) { sprintf(floatType, "float"); if (app->configuration.halfPrecisionMemoryOnly) { //only out of place mode, input/output buffer must be different sprintf(floatTypeKernelMemory, "float"); if ((axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.actualInverse)) sprintf(floatTypeInputMemory, "half"); else sprintf(floatTypeInputMemory, "float"); if ((axis_id == app->firstAxis) && (axis_upload_id == 0) && (axis->specializationConstants.actualInverse)) sprintf(floatTypeOutputMemory, "half"); else sprintf(floatTypeOutputMemory, "float"); } else { sprintf(floatTypeInputMemory, "half"); 
sprintf(floatTypeOutputMemory, "half"); sprintf(floatTypeKernelMemory, "half"); } } else { if (app->configuration.doublePrecisionFloatMemory) { sprintf(floatType, "double"); sprintf(floatTypeInputMemory, "float"); sprintf(floatTypeOutputMemory, "float"); sprintf(floatTypeKernelMemory, "float"); } else { sprintf(floatType, "float"); sprintf(floatTypeInputMemory, "float"); sprintf(floatTypeOutputMemory, "float"); sprintf(floatTypeKernelMemory, "float"); } } } char uintType[20] = ""; if (!app->configuration.useUint64) { #if(VKFFT_BACKEND==0) sprintf(uintType, "uint"); #elif(VKFFT_BACKEND==1) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==2) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==3) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==4) sprintf(uintType, "unsigned int"); #elif(VKFFT_BACKEND==5) sprintf(uintType, "uint"); #endif } else { #if(VKFFT_BACKEND==0) sprintf(uintType, "uint64_t"); #elif(VKFFT_BACKEND==1) sprintf(uintType, "unsigned long long"); #elif(VKFFT_BACKEND==2) sprintf(uintType, "unsigned long long"); #elif(VKFFT_BACKEND==3) sprintf(uintType, "unsigned long"); #elif(VKFFT_BACKEND==4) sprintf(uintType, "unsigned long"); #elif(VKFFT_BACKEND==5) sprintf(uintType, "ulong"); #endif } { axis->pushConstants.structSize = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { axis->pushConstants.performWorkGroupShift[0] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performWorkGroupShift[1]) { axis->pushConstants.performWorkGroupShift[1] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performWorkGroupShift[2]) { axis->pushConstants.performWorkGroupShift[2] = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationInputOffset) { axis->pushConstants.performPostCompilationInputOffset = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationOutputOffset) { 
axis->pushConstants.performPostCompilationOutputOffset = 1; axis->pushConstants.structSize += 1; } if (axis->specializationConstants.performPostCompilationKernelOffset) { axis->pushConstants.performPostCompilationKernelOffset = 1; axis->pushConstants.structSize += 1; } if (app->configuration.useUint64) axis->pushConstants.structSize *= sizeof(uint64_t); else axis->pushConstants.structSize *= sizeof(uint32_t); axis->specializationConstants.pushConstantsStructSize = axis->pushConstants.structSize; } //uint64_t LUT = app->configuration.useLUT; uint64_t type = 0; if ((axis_id == 0) && (axis_upload_id == 0)) type = 0; if (axis_id != 0) type = 1; if ((axis_id == 0) && (axis_upload_id > 0)) type = 2; //if ((axis->specializationConstants.fftDim == 8 * maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 8)) axis->specializationConstants.registerBoost = 8; if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 5; if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 6; if ((axis_id == 0) && (app->configuration.performDCT == 1)) type = 110; if ((axis_id != 0) && (app->configuration.performDCT == 1)) type = 111; if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 120; if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 121; if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 130; if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 131; if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 142; if ((axis_id == 0) && 
(app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 144; if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 143; if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 145; #if(VKFFT_BACKEND==0) axis->specializationConstants.cacheShuffle = 0; //((FFTPlan->numAxisUploads[axis_id] > 1) && ((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) == 0) && (!app->configuration.doublePrecision) && (!axis->specializationConstants.useBluesteinFFT) && (!app->configuration.doublePrecisionFloatMemory) && ((type == 0) || (type == 5) || (type == 6))) ? 1 : 0; #elif(VKFFT_BACKEND==1) axis->specializationConstants.cacheShuffle = 0; #elif(VKFFT_BACKEND==2) axis->specializationConstants.cacheShuffle = 0; #elif(VKFFT_BACKEND==3) axis->specializationConstants.cacheShuffle = 0; #elif(VKFFT_BACKEND==4) axis->specializationConstants.cacheShuffle = 0; #elif(VKFFT_BACKEND==5) axis->specializationConstants.cacheShuffle = 0; #endif axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength; axis->specializationConstants.maxTempLength = app->configuration.maxTempLength; axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength); char* code0 = axis->specializationConstants.code0; if (!code0) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } resFFT = shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type); freeShaderGenVkFFT(&axis->specializationConstants); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } #if(VKFFT_BACKEND==0) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; 
memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { glslang_resource_t default_resource = {}; default_resource.max_lights = 32; default_resource.max_clip_planes = 6; default_resource.max_texture_units = 32; default_resource.max_texture_coords = 32; default_resource.max_vertex_attribs = 64; default_resource.max_vertex_uniform_components = 4096; default_resource.max_varying_floats = 64; default_resource.max_vertex_texture_image_units = 32; default_resource.max_combined_texture_image_units = 80; default_resource.max_texture_image_units = 32; default_resource.max_fragment_uniform_components = 4096; default_resource.max_draw_buffers = 32; default_resource.max_vertex_uniform_vectors = 128; default_resource.max_varying_vectors = 8; default_resource.max_fragment_uniform_vectors = 16; default_resource.max_vertex_output_vectors = 16; default_resource.max_fragment_input_vectors = 15; default_resource.min_program_texel_offset = -8; default_resource.max_program_texel_offset = 7; default_resource.max_clip_distances = 8; default_resource.max_compute_work_group_count_x = (int)app->configuration.maxComputeWorkGroupCount[0]; default_resource.max_compute_work_group_count_y = (int)app->configuration.maxComputeWorkGroupCount[1]; default_resource.max_compute_work_group_count_z = (int)app->configuration.maxComputeWorkGroupCount[2]; default_resource.max_compute_work_group_size_x = (int)app->configuration.maxComputeWorkGroupSize[0]; default_resource.max_compute_work_group_size_y = (int)app->configuration.maxComputeWorkGroupSize[1]; default_resource.max_compute_work_group_size_z = (int)app->configuration.maxComputeWorkGroupSize[2]; default_resource.max_compute_uniform_components = 1024; 
default_resource.max_compute_texture_image_units = 16; default_resource.max_compute_image_uniforms = 8; default_resource.max_compute_atomic_counters = 8; default_resource.max_compute_atomic_counter_buffers = 1; default_resource.max_varying_components = 60; default_resource.max_vertex_output_components = 64; default_resource.max_geometry_input_components = 64; default_resource.max_geometry_output_components = 128; default_resource.max_fragment_input_components = 128; default_resource.max_image_units = 8; default_resource.max_combined_image_units_and_fragment_outputs = 8; default_resource.max_combined_shader_output_resources = 8; default_resource.max_image_samples = 0; default_resource.max_vertex_image_uniforms = 0; default_resource.max_tess_control_image_uniforms = 0; default_resource.max_tess_evaluation_image_uniforms = 0; default_resource.max_geometry_image_uniforms = 0; default_resource.max_fragment_image_uniforms = 8; default_resource.max_combined_image_uniforms = 8; default_resource.max_geometry_texture_image_units = 16; default_resource.max_geometry_output_vertices = 256; default_resource.max_geometry_total_output_components = 1024; default_resource.max_geometry_uniform_components = 1024; default_resource.max_geometry_varying_components = 64; default_resource.max_tess_control_input_components = 128; default_resource.max_tess_control_output_components = 128; default_resource.max_tess_control_texture_image_units = 16; default_resource.max_tess_control_uniform_components = 1024; default_resource.max_tess_control_total_output_components = 4096; default_resource.max_tess_evaluation_input_components = 128; default_resource.max_tess_evaluation_output_components = 128; default_resource.max_tess_evaluation_texture_image_units = 16; default_resource.max_tess_evaluation_uniform_components = 1024; default_resource.max_tess_patch_components = 120; default_resource.max_patch_vertices = 32; default_resource.max_tess_gen_level = 64; default_resource.max_viewports = 16; 
default_resource.max_vertex_atomic_counters = 0; default_resource.max_tess_control_atomic_counters = 0; default_resource.max_tess_evaluation_atomic_counters = 0; default_resource.max_geometry_atomic_counters = 0; default_resource.max_fragment_atomic_counters = 8; default_resource.max_combined_atomic_counters = 8; default_resource.max_atomic_counter_bindings = 1; default_resource.max_vertex_atomic_counter_buffers = 0; default_resource.max_tess_control_atomic_counter_buffers = 0; default_resource.max_tess_evaluation_atomic_counter_buffers = 0; default_resource.max_geometry_atomic_counter_buffers = 0; default_resource.max_fragment_atomic_counter_buffers = 1; default_resource.max_combined_atomic_counter_buffers = 1; default_resource.max_atomic_counter_buffer_size = 16384; default_resource.max_transform_feedback_buffers = 4; default_resource.max_transform_feedback_interleaved_components = 64; default_resource.max_cull_distances = 8; default_resource.max_combined_clip_and_cull_distances = 8; default_resource.max_samples = 4; default_resource.max_mesh_output_vertices_nv = 256; default_resource.max_mesh_output_primitives_nv = 512; default_resource.max_mesh_work_group_size_x_nv = 32; default_resource.max_mesh_work_group_size_y_nv = 1; default_resource.max_mesh_work_group_size_z_nv = 1; default_resource.max_task_work_group_size_x_nv = 32; default_resource.max_task_work_group_size_y_nv = 1; default_resource.max_task_work_group_size_z_nv = 1; default_resource.max_mesh_view_count_nv = 4; default_resource.limits.non_inductive_for_loops = 1; default_resource.limits.while_loops = 1; default_resource.limits.do_while_loops = 1; default_resource.limits.general_uniform_indexing = 1; default_resource.limits.general_attribute_matrix_vector_indexing = 1; default_resource.limits.general_varying_indexing = 1; default_resource.limits.general_sampler_indexing = 1; default_resource.limits.general_variable_indexing = 1; default_resource.limits.general_constant_matrix_vector_indexing = 1; 
glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0; glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0; glslang_input_t input = { GLSLANG_SOURCE_GLSL, GLSLANG_STAGE_COMPUTE, GLSLANG_CLIENT_VULKAN, client_version, GLSLANG_TARGET_SPV, target_language_version, code0, 450, GLSLANG_NO_PROFILE, 1, 0, GLSLANG_MSG_DEFAULT_BIT, (const glslang_resource_t*)&default_resource, }; //printf("%s\n", code0); glslang_shader_t* shader = glslang_shader_create((const glslang_input_t*)&input); const char* err; if (!glslang_shader_preprocess(shader, &input)) { err = glslang_shader_get_info_log(shader); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_PREPROCESS; } if (!glslang_shader_parse(shader, &input)) { err = glslang_shader_get_info_log(shader); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_PARSE; } glslang_program_t* program = glslang_program_create(); glslang_program_add_shader(program, shader); if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT)) { err = glslang_program_get_info_log(program); printf("%s\n", code0); printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); glslang_shader_delete(shader); glslang_program_delete(program); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_SHADER_LINK; } glslang_program_SPIRV_generate(program, input.stage); if (glslang_program_SPIRV_get_messages(program)) { printf("%s", glslang_program_SPIRV_get_messages(program)); glslang_shader_delete(shader); glslang_program_delete(program); free(code0); code0 = 0; deleteVkFFT(app); 
return VKFFT_ERROR_FAILED_SPIRV_GENERATE; } glslang_shader_delete(shader); uint32_t* tempCode = glslang_program_SPIRV_get_ptr(program); codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t); axis->binarySize = codeSize; code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; glslang_program_delete(program); deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } axis->binary = code; memcpy(code, tempCode, codeSize); glslang_program_delete(program); } VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; createInfo.pCode = code; createInfo.codeSize = codeSize; res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module); if (res != VK_SUCCESS) { free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; pipelineLayoutCreateInfo.setLayoutCount = 1; pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout; VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT }; pushConstantRange.offset = 0; pushConstantRange.size = (uint32_t)axis->pushConstants.structSize; // Push constant ranges are part of the pipeline layout if (axis->pushConstants.structSize) { pipelineLayoutCreateInfo.pushConstantRangeCount = 1; pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; } res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT; } 
pipelineShaderStageCreateInfo.pName = "main"; pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo; computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; computePipelineCreateInfo.layout = axis->pipelineLayout; if (app->configuration.pipelineCache) res = vkCreateComputePipelines(app->configuration.device[0], app->configuration.pipelineCache[0], 1, &computePipelineCreateInfo, 0, &axis->pipeline); else res = vkCreateComputePipelines(app->configuration.device[0], 0, 1, &computePipelineCreateInfo, 0, &axis->pipeline); if (res != VK_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE; } vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0); if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==1) char* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { nvrtcProgram prog; nvrtcResult result = nvrtcCreateProgram(&prog, // prog code0, // buffer "VkFFT.cu", // name 0, // numHeaders 0, // headers 0); // includeNames //free(includeNames); //free(headers); if (result != NVRTC_SUCCESS) { printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } #if (CUDA_VERSION >= 11030) char* opts[5]; opts[0] = (char*)malloc(sizeof(char) * 50); if (!opts[0]) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } sprintf(opts[0], "--gpu-architecture=sm_%" PRIu64 "%" PRIu64 "", 
app->configuration.computeCapabilityMajor, app->configuration.computeCapabilityMinor); //result = nvrtcAddNameExpression(prog, "&consts"); //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result)); result = nvrtcCompileProgram(prog, // prog 1, // numOptions (const char* const*)opts); // options free(opts[0]); #else result = nvrtcCompileProgram(prog, // prog 0, // numOptions 0); // options #endif if (result != NVRTC_SUCCESS) { printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 4000000); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } else { nvrtcGetProgramLog(prog, log); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } #if (CUDA_VERSION >= 11030) result = nvrtcGetCUBINSize(prog, &codeSize); #else result = nvrtcGetPTXSize(prog, &codeSize); #endif if (result != NVRTC_SUCCESS) { #if (CUDA_VERSION >= 11030) printf("nvrtcGetCUBINSize error: %s\n", nvrtcGetErrorString(result)); #else printf("nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result)); #endif free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; } axis->binarySize = codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } axis->binary = code; #if (CUDA_VERSION >= 11030) result = nvrtcGetCUBIN(prog, code); #else result = nvrtcGetPTX(prog, code); #endif if (result != NVRTC_SUCCESS) { #if (CUDA_VERSION >= 11030) printf("nvrtcGetCUBIN error: %s\n", nvrtcGetErrorString(result)); #else printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result)); #endif free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE; } result = nvrtcDestroyProgram(&prog); if (result != NVRTC_SUCCESS) { printf("nvrtcDestroyProgram error: 
%s\n", nvrtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; } } CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); if (result2 != CUDA_SUCCESS) { printf("cuModuleLoadDataEx error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; } result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main"); if (result2 != CUDA_SUCCESS) { printf("cuModuleGetFunction error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } /*result2 = cuFuncSetCacheConfig(axis->VkFFTKernel, CU_FUNC_CACHE_PREFER_SHARED); if (result2 != CUDA_SUCCESS) { printf("cuFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; }*/ if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory); if (result2 != CUDA_SUCCESS) { printf("cuFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; } } if (axis->pushConstants.structSize) { size_t size = axis->pushConstants.structSize; result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); if (result2 != CUDA_SUCCESS) { printf("cuModuleGetGlobal error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; } } if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==2) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* 
localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); } else { hiprtcProgram prog; enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog code0, // buffer "VkFFT.hip", // name 0, // numHeaders 0, // headers 0); // includeNames if (result != HIPRTC_SUCCESS) { printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } if (axis->pushConstants.structSize) { result = hiprtcAddNameExpression(prog, "&consts"); if (result != HIPRTC_SUCCESS) { printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION; } } result = hiprtcCompileProgram(prog, // prog 0, // numOptions 0); // options if (result != HIPRTC_SUCCESS) { printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result)); char* log = (char*)malloc(sizeof(char) * 100000); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } else { hiprtcGetProgramLog(prog, log); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } result = hiprtcGetCodeSize(prog, &codeSize); if (result != HIPRTC_SUCCESS) { printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE; } axis->binarySize = codeSize; code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } 
axis->binary = code; result = hiprtcGetCode(prog, (char*)code); if (result != HIPRTC_SUCCESS) { printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; } //printf("%s\n", code); // Destroy the program. result = hiprtcDestroyProgram(&prog); if (result != HIPRTC_SUCCESS) { printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result)); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; } } hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); if (result2 != hipSuccess) { printf("hipModuleLoadDataEx error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; } result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main"); if (result2 != hipSuccess) { printf("hipModuleGetFunction error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; } /*result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); if (result2 != hipSuccess) { printf("hipFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; }*/ if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory); //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); if (result2 != hipSuccess) { printf("hipFuncSetAttribute error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; } } if (axis->pushConstants.structSize) { size_t size = 
axis->pushConstants.structSize; result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); if (result2 != hipSuccess) { printf("hipModuleGetGlobal error: %d\n", result2); free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; } } if (!app->configuration.saveApplicationToString) { free(code); code = 0; } #elif(VKFFT_BACKEND==3) if (app->configuration.loadApplicationFromString) { char* code; uint64_t codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); const unsigned char* temp_code = (const unsigned char*)code; axis->program = clCreateProgramWithBinary(app->configuration.context[0], 1, app->configuration.device, &codeSize_size_t, (const unsigned char**)(&temp_code), 0, &res); if (res != CL_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(code); code = 0; } else { size_t codelen = strlen(code0); const char* temp_code = (const char*)code0; axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&temp_code, &codelen, &res); if (res != CL_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } } res = clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0); if (res != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size); char* log = (char*)malloc(log_size); if (!log) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } 
else { clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0); printf("%s\n", log); free(log); log = 0; printf("%s\n", code0); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } if (app->configuration.saveApplicationToString) { size_t codeSize; res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &codeSize, NULL); if (res != CL_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } axis->binarySize = (uint64_t)codeSize; axis->binary = (char*)malloc(axis->binarySize); if (!axis->binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARIES, codeSize, &axis->binary, NULL); if (res != CL_SUCCESS) { free(axis->binary); axis->binary = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } axis->kernel = clCreateKernel(axis->program, "VkFFT_main", &res); if (res != CL_SUCCESS) { if (app->configuration.saveApplicationToString) { free(axis->binary); axis->binary = 0; } free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } #elif(VKFFT_BACKEND==4) uint32_t* code; uint64_t codeSize; if (app->configuration.loadApplicationFromString) { char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); code = (uint32_t*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; ze_module_desc_t moduleDesc = { ZE_STRUCTURE_TYPE_MODULE_DESC, 0, ZE_MODULE_FORMAT_NATIVE, codeSize, (uint8_t*)code, pBuildFlags, 0 }; res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); if (res != ZE_RESULT_SUCCESS) { free(code); code = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(code); code = 0; } else { size_t codelen = strlen(code0); uint64_t successOpen = 0; FILE* temp; char fname_cl[100]; char fname_bc[100]; char fname_spv[100]; int name_id = 0; while (!successOpen) { sprintf(fname_cl, "VkFFT_temp_cl_%d.cl", name_id); temp = fopen(fname_cl, "r"); if (temp != 0) { fclose(temp); name_id++; } else { successOpen = 1; sprintf(fname_bc, "VkFFT_temp_bc_%d.spv", name_id); sprintf(fname_spv, "VkFFT_temp_cl_%d.spv", name_id); } } temp = fopen(fname_cl, "w"); fwrite(code0, 1, codelen, temp); fclose(temp); char system_call[500]; sprintf(system_call, "clang -c -target spir64 -O0 -emit-llvm -o %s %s", fname_bc, fname_cl); system(system_call); sprintf(system_call, "llvm-spirv -o %s %s", fname_spv, fname_bc); system(system_call); temp = fopen(fname_spv, "rb"); fseek(temp, 0L, SEEK_END); uint64_t spv_size = ftell(temp); rewind(temp); uint8_t* spv_binary = (uint8_t*)malloc(spv_size); if (!spv_binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } fread(spv_binary, 1, spv_size, temp); fclose(temp); remove(fname_cl); remove(fname_bc); remove(fname_spv); const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; ze_module_desc_t moduleDesc = { ZE_STRUCTURE_TYPE_MODULE_DESC, 0, ZE_MODULE_FORMAT_IL_SPIRV, spv_size, spv_binary, pBuildFlags, 0 }; res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); if (res != ZE_RESULT_SUCCESS) { free(spv_binary); spv_binary = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; } free(spv_binary); spv_binary = 0; if (app->configuration.saveApplicationToString) { size_t codeSize; res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, 0); if (res != ZE_RESULT_SUCCESS) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } axis->binarySize = codeSize; axis->binary = (char*)malloc(axis->binarySize); if (!axis->binary) { free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, (uint8_t*)axis->binary); if (res != ZE_RESULT_SUCCESS) { free(axis->binary); axis->binary = 0; free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } } } ze_kernel_desc_t kernelDesc = { ZE_STRUCTURE_TYPE_KERNEL_DESC, 0, 0, // flags "VkFFT_main" }; res = zeKernelCreate(axis->VkFFTModule, &kernelDesc, &axis->VkFFTKernel); if (res != ZE_RESULT_SUCCESS) { if (app->configuration.saveApplicationToString) { free(axis->binary); axis->binary = 0; } free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; } #elif(VKFFT_BACKEND==5) NS::Error* error; if (app->configuration.loadApplicationFromString) { char* code; uint64_t codeSize; char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); size_t codeSize_size_t = (size_t)codeSize; code = (char*)malloc(codeSize); if (!code) { free(code0); code0 = 0; deleteVkFFT(app); return 
VKFFT_ERROR_MALLOC_FAILED; } memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); app->currentApplicationStringPos += codeSize + sizeof(uint64_t); dispatch_data_t data = dispatch_data_create(code, codeSize, 0, 0); axis->library = app->configuration.device->newLibrary(data, &error); if (error)std::cout << error->debugDescription()->cString(NS::ASCIIStringEncoding) << error->localizedDescription()->cString(NS::ASCIIStringEncoding) << std::endl; free(code); code = 0; } else { size_t codelen = strlen(code0); MTL::CompileOptions* compileOptions = MTL::CompileOptions::alloc(); compileOptions->setFastMathEnabled(true); NS::String* str = NS::String::string(code0, NS::UTF8StringEncoding); axis->library = app->configuration.device->newLibrary(str, compileOptions, &error); if (error) { printf("%s\n%s\n", error->debugDescription()->cString(NS::ASCIIStringEncoding), error->localizedDescription()->cString(NS::ASCIIStringEncoding)); free(code0); code0 = 0; deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; } compileOptions->release(); if (app->configuration.saveApplicationToString) { } str->release(); } const char function_name[20] = "VkFFT_main"; NS::String* str = NS::String::string(function_name, NS::UTF8StringEncoding); MTL::Function* function = axis->library->newFunction(str); axis->pipeline = app->configuration.device->newComputePipelineState(function, &error); function->release(); str->release(); #endif if (!app->configuration.keepShaderCode) { free(code0); code0 = 0; axis->specializationConstants.code0 = 0; } } if (axis->specializationConstants.axisSwapped) {//swap back for correct dispatch uint64_t temp = axis->axisBlock[1]; axis->axisBlock[1] = axis->axisBlock[0]; axis->axisBlock[0] = temp; axis->specializationConstants.axisSwapped = 0; } return resFFT; } static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) { VkFFTResult resFFT = VKFFT_SUCCESS; if (!app->configuration.useCustomBluesteinPaddingPattern) { switch 
(app->configuration.vendorID) { case 0x10DE://NVIDIA if (app->configuration.doublePrecision) { app->configuration.autoCustomBluesteinPaddingPattern = 48; } else { app->configuration.autoCustomBluesteinPaddingPattern = 45; } break; default: //have not done a test run for Intel, so everything else uses AMD profile if (app->configuration.doublePrecision) { app->configuration.autoCustomBluesteinPaddingPattern = 54; } else { app->configuration.autoCustomBluesteinPaddingPattern = 29; } break; } app->configuration.primeSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); if (!app->configuration.primeSizes) return VKFFT_ERROR_MALLOC_FAILED; app->configuration.paddedSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); if (!app->configuration.paddedSizes) return VKFFT_ERROR_MALLOC_FAILED; switch (app->configuration.vendorID) { case 0x10DE://Nvidia if (app->configuration.doublePrecision) { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; app->configuration.paddedSizes[1] = 40; app->configuration.primeSizes[2] = 23; app->configuration.paddedSizes[2] = 48; app->configuration.primeSizes[3] = 29; app->configuration.paddedSizes[3] = 64; app->configuration.primeSizes[4] = 34; app->configuration.paddedSizes[4] = 70; app->configuration.primeSizes[5] = 37; app->configuration.paddedSizes[5] = 80; app->configuration.primeSizes[6] = 41; app->configuration.paddedSizes[6] = 90; app->configuration.primeSizes[7] = 46; app->configuration.paddedSizes[7] = 96; app->configuration.primeSizes[8] = 51; app->configuration.paddedSizes[8] = 104; app->configuration.primeSizes[9] = 53; app->configuration.paddedSizes[9] = 128; app->configuration.primeSizes[10] = 67; app->configuration.paddedSizes[10] = 144; app->configuration.primeSizes[11] = 73; app->configuration.paddedSizes[11] = 160; app->configuration.primeSizes[12] = 82; 
app->configuration.paddedSizes[12] = 256; app->configuration.primeSizes[13] = 129; app->configuration.paddedSizes[13] = 288; app->configuration.primeSizes[14] = 145; app->configuration.paddedSizes[14] = 512; app->configuration.primeSizes[15] = 257; app->configuration.paddedSizes[15] = 625; app->configuration.primeSizes[16] = 314; app->configuration.paddedSizes[16] = 750; app->configuration.primeSizes[17] = 376; app->configuration.paddedSizes[17] = 756; app->configuration.primeSizes[18] = 379; app->configuration.paddedSizes[18] = 768; app->configuration.primeSizes[19] = 386; app->configuration.paddedSizes[19] = 1024; app->configuration.primeSizes[20] = 513; app->configuration.paddedSizes[20] = 1056; app->configuration.primeSizes[21] = 529; app->configuration.paddedSizes[21] = 1200; app->configuration.primeSizes[22] = 601; app->configuration.paddedSizes[22] = 1225; app->configuration.primeSizes[23] = 614; app->configuration.paddedSizes[23] = 1250; app->configuration.primeSizes[24] = 626; app->configuration.paddedSizes[24] = 1296; app->configuration.primeSizes[25] = 649; app->configuration.paddedSizes[25] = 1331; app->configuration.primeSizes[26] = 667; app->configuration.paddedSizes[26] = 1440; app->configuration.primeSizes[27] = 721; app->configuration.paddedSizes[27] = 1456; app->configuration.primeSizes[28] = 730; app->configuration.paddedSizes[28] = 1560; app->configuration.primeSizes[29] = 781; app->configuration.paddedSizes[29] = 2048; app->configuration.primeSizes[30] = 1025; app->configuration.paddedSizes[30] = 2187; app->configuration.primeSizes[31] = 1095; app->configuration.paddedSizes[31] = 2304; app->configuration.primeSizes[32] = 1153; app->configuration.paddedSizes[32] = 2688; app->configuration.primeSizes[33] = 1345; app->configuration.paddedSizes[33] = 2730; app->configuration.primeSizes[34] = 1366; app->configuration.paddedSizes[34] = 2925; app->configuration.primeSizes[35] = 1464; app->configuration.paddedSizes[35] = 3000; 
app->configuration.primeSizes[36] = 1501; app->configuration.paddedSizes[36] = 4096; app->configuration.primeSizes[37] = 2049; app->configuration.paddedSizes[37] = 4368; app->configuration.primeSizes[38] = 2185; app->configuration.paddedSizes[38] = 4608; app->configuration.primeSizes[39] = 2305; app->configuration.paddedSizes[39] = 4900; app->configuration.primeSizes[40] = 2364; app->configuration.paddedSizes[40] = 4900; app->configuration.primeSizes[41] = 2451; app->configuration.paddedSizes[41] = 5184; app->configuration.primeSizes[42] = 2593; app->configuration.paddedSizes[42] = 5625; app->configuration.primeSizes[43] = 2814; app->configuration.paddedSizes[43] = 5760; app->configuration.primeSizes[44] = 2881; app->configuration.paddedSizes[44] = 6000; app->configuration.primeSizes[45] = 3001; app->configuration.paddedSizes[45] = 6048; app->configuration.primeSizes[46] = 3026; app->configuration.paddedSizes[46] = 6561; app->configuration.primeSizes[47] = 3282; app->configuration.paddedSizes[47] = 8192; } else { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; app->configuration.paddedSizes[1] = 40; app->configuration.primeSizes[2] = 23; app->configuration.paddedSizes[2] = 48; app->configuration.primeSizes[3] = 29; app->configuration.paddedSizes[3] = 64; app->configuration.primeSizes[4] = 34; app->configuration.paddedSizes[4] = 70; app->configuration.primeSizes[5] = 37; app->configuration.paddedSizes[5] = 80; app->configuration.primeSizes[6] = 41; app->configuration.paddedSizes[6] = 96; app->configuration.primeSizes[7] = 51; app->configuration.paddedSizes[7] = 104; app->configuration.primeSizes[8] = 53; app->configuration.paddedSizes[8] = 112; app->configuration.primeSizes[9] = 57; app->configuration.paddedSizes[9] = 120; app->configuration.primeSizes[10] = 61; app->configuration.paddedSizes[10] = 128; app->configuration.primeSizes[11] = 67; app->configuration.paddedSizes[11] = 144; 
app->configuration.primeSizes[12] = 73; app->configuration.paddedSizes[12] = 150; app->configuration.primeSizes[13] = 76; app->configuration.paddedSizes[13] = 160; app->configuration.primeSizes[14] = 82; app->configuration.paddedSizes[14] = 256; app->configuration.primeSizes[15] = 129; app->configuration.paddedSizes[15] = 384; app->configuration.primeSizes[16] = 193; app->configuration.paddedSizes[16] = 512; app->configuration.primeSizes[17] = 257; app->configuration.paddedSizes[17] = 567; app->configuration.primeSizes[18] = 285; app->configuration.paddedSizes[18] = 625; app->configuration.primeSizes[19] = 314; app->configuration.paddedSizes[19] = 768; app->configuration.primeSizes[20] = 386; app->configuration.paddedSizes[20] = 832; app->configuration.primeSizes[21] = 417; app->configuration.paddedSizes[21] = 1024; app->configuration.primeSizes[22] = 513; app->configuration.paddedSizes[22] = 1152; app->configuration.primeSizes[23] = 577; app->configuration.paddedSizes[23] = 1200; app->configuration.primeSizes[24] = 601; app->configuration.paddedSizes[24] = 1296; app->configuration.primeSizes[25] = 649; app->configuration.paddedSizes[25] = 1536; app->configuration.primeSizes[26] = 769; app->configuration.paddedSizes[26] = 2048; app->configuration.primeSizes[27] = 1025; app->configuration.paddedSizes[27] = 2187; app->configuration.primeSizes[28] = 1095; app->configuration.paddedSizes[28] = 2304; app->configuration.primeSizes[29] = 1153; app->configuration.paddedSizes[29] = 2500; app->configuration.primeSizes[30] = 1251; app->configuration.paddedSizes[30] = 2592; app->configuration.primeSizes[31] = 1297; app->configuration.paddedSizes[31] = 2816; app->configuration.primeSizes[32] = 1409; app->configuration.paddedSizes[32] = 3072; app->configuration.primeSizes[33] = 1537; app->configuration.paddedSizes[33] = 4096; app->configuration.primeSizes[34] = 2049; app->configuration.paddedSizes[34] = 4368; app->configuration.primeSizes[35] = 2185; 
app->configuration.paddedSizes[35] = 4563; app->configuration.primeSizes[36] = 2283; app->configuration.paddedSizes[36] = 4576; app->configuration.primeSizes[37] = 2289; app->configuration.paddedSizes[37] = 4608; app->configuration.primeSizes[38] = 2305; app->configuration.paddedSizes[38] = 5184; app->configuration.primeSizes[39] = 2593; app->configuration.paddedSizes[39] = 5625; app->configuration.primeSizes[40] = 2814; app->configuration.paddedSizes[40] = 5632; app->configuration.primeSizes[41] = 2817; app->configuration.paddedSizes[41] = 6000; app->configuration.primeSizes[42] = 3001; app->configuration.paddedSizes[42] = 6144; app->configuration.primeSizes[43] = 3073; app->configuration.paddedSizes[43] = 6561; app->configuration.primeSizes[44] = 3282; app->configuration.paddedSizes[44] = 8192; } break; default: //have not done a test run for Intel, so everything else uses AMD profile if (app->configuration.doublePrecision) { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; app->configuration.paddedSizes[1] = 40; app->configuration.primeSizes[2] = 23; app->configuration.paddedSizes[2] = 56; app->configuration.primeSizes[3] = 29; app->configuration.paddedSizes[3] = 64; app->configuration.primeSizes[4] = 34; app->configuration.paddedSizes[4] = 70; app->configuration.primeSizes[5] = 37; app->configuration.paddedSizes[5] = 78; app->configuration.primeSizes[6] = 41; app->configuration.paddedSizes[6] = 81; app->configuration.primeSizes[7] = 43; app->configuration.paddedSizes[7] = 90; app->configuration.primeSizes[8] = 46; app->configuration.paddedSizes[8] = 125; app->configuration.primeSizes[9] = 67; app->configuration.paddedSizes[9] = 150; app->configuration.primeSizes[10] = 76; app->configuration.paddedSizes[10] = 175; app->configuration.primeSizes[11] = 89; app->configuration.paddedSizes[11] = 189; app->configuration.primeSizes[12] = 97; app->configuration.paddedSizes[12] = 198; 
app->configuration.primeSizes[13] = 101; app->configuration.paddedSizes[13] = 243; app->configuration.primeSizes[14] = 123; app->configuration.paddedSizes[14] = 256; app->configuration.primeSizes[15] = 129; app->configuration.paddedSizes[15] = 270; app->configuration.primeSizes[16] = 136; app->configuration.paddedSizes[16] = 512; app->configuration.primeSizes[17] = 257; app->configuration.paddedSizes[17] = 625; app->configuration.primeSizes[18] = 314; app->configuration.paddedSizes[18] = 640; app->configuration.primeSizes[19] = 321; app->configuration.paddedSizes[19] = 702; app->configuration.primeSizes[20] = 353; app->configuration.paddedSizes[20] = 750; app->configuration.primeSizes[21] = 376; app->configuration.paddedSizes[21] = 756; app->configuration.primeSizes[22] = 379; app->configuration.paddedSizes[22] = 768; app->configuration.primeSizes[23] = 386; app->configuration.paddedSizes[23] = 875; app->configuration.primeSizes[24] = 439; app->configuration.paddedSizes[24] = 1024; app->configuration.primeSizes[25] = 513; app->configuration.paddedSizes[25] = 1296; app->configuration.primeSizes[26] = 649; app->configuration.paddedSizes[26] = 1300; app->configuration.primeSizes[27] = 651; app->configuration.paddedSizes[27] = 1323; app->configuration.primeSizes[28] = 663; app->configuration.paddedSizes[28] = 1344; app->configuration.primeSizes[29] = 673; app->configuration.paddedSizes[29] = 1512; app->configuration.primeSizes[30] = 757; app->configuration.paddedSizes[30] = 1792; app->configuration.primeSizes[31] = 897; app->configuration.paddedSizes[31] = 2016; app->configuration.primeSizes[32] = 1009; app->configuration.paddedSizes[32] = 2048; app->configuration.primeSizes[33] = 1025; app->configuration.paddedSizes[33] = 2187; app->configuration.primeSizes[34] = 1095; app->configuration.paddedSizes[34] = 3136; app->configuration.primeSizes[35] = 1569; app->configuration.paddedSizes[35] = 3159; app->configuration.primeSizes[36] = 1581; 
app->configuration.paddedSizes[36] = 3430; app->configuration.primeSizes[37] = 1717; app->configuration.paddedSizes[37] = 3584; app->configuration.primeSizes[38] = 1793; app->configuration.paddedSizes[38] = 4096; app->configuration.primeSizes[39] = 2049; app->configuration.paddedSizes[39] = 4224; app->configuration.primeSizes[40] = 2113; app->configuration.paddedSizes[40] = 4375; app->configuration.primeSizes[41] = 2189; app->configuration.paddedSizes[41] = 4480; app->configuration.primeSizes[42] = 2241; app->configuration.paddedSizes[42] = 4704; app->configuration.primeSizes[43] = 2353; app->configuration.paddedSizes[43] = 4928; app->configuration.primeSizes[44] = 2465; app->configuration.paddedSizes[44] = 4992; app->configuration.primeSizes[45] = 2497; app->configuration.paddedSizes[45] = 5005; app->configuration.primeSizes[46] = 2504; app->configuration.paddedSizes[46] = 5103; app->configuration.primeSizes[47] = 2553; app->configuration.paddedSizes[47] = 5376; app->configuration.primeSizes[48] = 2689; app->configuration.paddedSizes[48] = 5632; app->configuration.primeSizes[49] = 2817; app->configuration.paddedSizes[49] = 5824; app->configuration.primeSizes[50] = 2913; app->configuration.paddedSizes[50] = 6048; app->configuration.primeSizes[51] = 3026; app->configuration.paddedSizes[51] = 6144; app->configuration.primeSizes[52] = 3073; app->configuration.paddedSizes[52] = 6875; app->configuration.primeSizes[53] = 3439; app->configuration.paddedSizes[53] = 8192; } else { app->configuration.primeSizes[0] = 17; app->configuration.paddedSizes[0] = 36; app->configuration.primeSizes[1] = 19; app->configuration.paddedSizes[1] = 42; app->configuration.primeSizes[2] = 23; app->configuration.paddedSizes[2] = 64; app->configuration.primeSizes[3] = 34; app->configuration.paddedSizes[3] = 81; app->configuration.primeSizes[4] = 43; app->configuration.paddedSizes[4] = 88; app->configuration.primeSizes[5] = 46; app->configuration.paddedSizes[5] = 125; 
app->configuration.primeSizes[6] = 67; app->configuration.paddedSizes[6] = 150; app->configuration.primeSizes[7] = 76; app->configuration.paddedSizes[7] = 162; app->configuration.primeSizes[8] = 82; app->configuration.paddedSizes[8] = 175; app->configuration.primeSizes[9] = 89; app->configuration.paddedSizes[9] = 256; app->configuration.primeSizes[10] = 129; app->configuration.paddedSizes[10] = 512; app->configuration.primeSizes[11] = 257; app->configuration.paddedSizes[11] = 625; app->configuration.primeSizes[12] = 314; app->configuration.paddedSizes[12] = 768; app->configuration.primeSizes[13] = 386; app->configuration.paddedSizes[13] = 1024; app->configuration.primeSizes[14] = 513; app->configuration.paddedSizes[14] = 1296; app->configuration.primeSizes[15] = 649; app->configuration.paddedSizes[15] = 2048; app->configuration.primeSizes[16] = 1025; app->configuration.paddedSizes[16] = 2187; app->configuration.primeSizes[17] = 1095; app->configuration.paddedSizes[17] = 2304; app->configuration.primeSizes[18] = 1153; app->configuration.paddedSizes[18] = 2500; app->configuration.primeSizes[19] = 1251; app->configuration.paddedSizes[19] = 2592; app->configuration.primeSizes[20] = 1297; app->configuration.paddedSizes[20] = 3072; app->configuration.primeSizes[21] = 1537; app->configuration.paddedSizes[21] = 3125; app->configuration.primeSizes[22] = 1564; app->configuration.paddedSizes[22] = 3136; app->configuration.primeSizes[23] = 1569; app->configuration.paddedSizes[23] = 4096; app->configuration.primeSizes[24] = 2049; app->configuration.paddedSizes[24] = 4375; app->configuration.primeSizes[25] = 2189; app->configuration.paddedSizes[25] = 4608; app->configuration.primeSizes[26] = 2305; app->configuration.paddedSizes[26] = 5184; app->configuration.primeSizes[27] = 2593; app->configuration.paddedSizes[27] = 6561; app->configuration.primeSizes[28] = 3282; app->configuration.paddedSizes[28] = 8192; } break; } } return resFFT; } static inline VkFFTResult 
initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration) { VkFFTResult resFFT = VKFFT_SUCCESS; //app->configuration = {};// inputLaunchConfiguration; if (inputLaunchConfiguration.doublePrecision != 0) app->configuration.doublePrecision = inputLaunchConfiguration.doublePrecision; if (inputLaunchConfiguration.doublePrecisionFloatMemory != 0) app->configuration.doublePrecisionFloatMemory = inputLaunchConfiguration.doublePrecisionFloatMemory; if (inputLaunchConfiguration.halfPrecision != 0) app->configuration.halfPrecision = inputLaunchConfiguration.halfPrecision; if (inputLaunchConfiguration.halfPrecisionMemoryOnly != 0) app->configuration.halfPrecisionMemoryOnly = inputLaunchConfiguration.halfPrecisionMemoryOnly; if (inputLaunchConfiguration.useCustomBluesteinPaddingPattern != 0) { app->configuration.useCustomBluesteinPaddingPattern = inputLaunchConfiguration.useCustomBluesteinPaddingPattern; app->configuration.primeSizes = inputLaunchConfiguration.primeSizes; if (!app->configuration.primeSizes) return VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays; app->configuration.paddedSizes = inputLaunchConfiguration.paddedSizes; if (!app->configuration.paddedSizes) return VKFFT_ERROR_EMPTY_useCustomBluesteinPaddingPattern_arrays; } //set device parameters #if(VKFFT_BACKEND==0) if (!inputLaunchConfiguration.isCompilerInitialized) { if (!app->configuration.isCompilerInitialized) { int resGlslangInitialize = glslang_initialize_process(); if (!resGlslangInitialize) return VKFFT_ERROR_FAILED_TO_INITIALIZE; app->configuration.isCompilerInitialized = 1; } } if (inputLaunchConfiguration.physicalDevice == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_PHYSICAL_DEVICE; } app->configuration.physicalDevice = inputLaunchConfiguration.physicalDevice; if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.queue == 0) { 
deleteVkFFT(app); return VKFFT_ERROR_INVALID_QUEUE; } app->configuration.queue = inputLaunchConfiguration.queue; if (inputLaunchConfiguration.commandPool == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_COMMAND_POOL; } app->configuration.commandPool = inputLaunchConfiguration.commandPool; if (inputLaunchConfiguration.fence == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_FENCE; } app->configuration.fence = inputLaunchConfiguration.fence; VkPhysicalDeviceProperties physicalDeviceProperties = { 0 }; vkGetPhysicalDeviceProperties(app->configuration.physicalDevice[0], &physicalDeviceProperties); app->configuration.maxThreadsNum = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations; if (physicalDeviceProperties.vendorID == 0x8086) app->configuration.maxThreadsNum = 256; //Intel fix app->configuration.maxComputeWorkGroupCount[0] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0]; app->configuration.maxComputeWorkGroupCount[1] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1]; app->configuration.maxComputeWorkGroupCount[2] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2]; app->configuration.maxComputeWorkGroupSize[0] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0]; app->configuration.maxComputeWorkGroupSize[1] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1]; app->configuration.maxComputeWorkGroupSize[2] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2]; //if ((physicalDeviceProperties.vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; app->configuration.sharedMemorySize = physicalDeviceProperties.limits.maxComputeSharedMemorySize; app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(physicalDeviceProperties.limits.maxComputeSharedMemorySize)); app->configuration.vendorID = physicalDeviceProperties.vendorID; if (inputLaunchConfiguration.pipelineCache != 0) 
app->configuration.pipelineCache = inputLaunchConfiguration.pipelineCache; app->configuration.useRaderUintLUT = 1; switch (physicalDeviceProperties.vendorID) { case 0x10DE://NVIDIA app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 4; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 4194305 : 4194305; break; case 0x8086://INTEL app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = 1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; case 0x1002://AMD app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.warpSize = 64; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 2 : 4; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; default: app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 
1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; } #elif(VKFFT_BACKEND==1) CUresult res = CUDA_SUCCESS; cudaError_t res_t = cudaSuccess; if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams; if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream; app->configuration.streamID = 0; int value = 0; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.computeCapabilityMajor = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.computeCapabilityMinor = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxThreadsNum = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupCount[0] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } 
app->configuration.maxComputeWorkGroupCount[1] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupCount[2] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[0] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[1] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[2] = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.sharedMemorySizeStatic = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.sharedMemorySize = value;// (value > 65536) ? 
65536 : value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.warpSize = value; res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO, app->configuration.device[0]); if (res != CUDA_SUCCESS) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.useLUT_4step = (value <= 4) ? -1 : 1; //we don't need this in CUDA app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 0; if (app->configuration.num_streams > 1) { app->configuration.stream_event = (cudaEvent_t*)malloc(app->configuration.num_streams * sizeof(cudaEvent_t)); if (!app->configuration.stream_event) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < app->configuration.num_streams; i++) { res_t = cudaEventCreate(&app->configuration.stream_event[i]); if (res_t != cudaSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_EVENT; } } } app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 
4194305 : 4194305; app->configuration.vendorID = 0x10DE; #elif(VKFFT_BACKEND==2) hipError_t res = hipSuccess; if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams; if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream; app->configuration.streamID = 0; int value = 0; res = hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.computeCapabilityMajor = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.computeCapabilityMinor = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxThreadsNum = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupCount[0] = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupCount[1] = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupCount[2] = 
value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[0] = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[1] = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[2] = value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.sharedMemorySizeStatic = value; //hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlockOptin, app->configuration.device[0]); app->configuration.sharedMemorySize = value;// (value > 65536) ? 
65536 : value; res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->configuration.device[0]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.warpSize = value; app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 0; if (app->configuration.num_streams > 1) { app->configuration.stream_event = (hipEvent_t*)malloc(app->configuration.num_streams * sizeof(hipEvent_t)); if (!app->configuration.stream_event) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } for (uint64_t i = 0; i < app->configuration.num_streams; i++) { res = hipEventCreate(&app->configuration.stream_event[i]); if (res != hipSuccess) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_CREATE_EVENT; } } } app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.useLUT_4step = -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 
1048576 : 2097152; app->configuration.vendorID = 0x1002; #elif(VKFFT_BACKEND==3) cl_int res = 0; if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.context == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_CONTEXT; } app->configuration.context = inputLaunchConfiguration.context; cl_uint vendorID; size_t value_int64; cl_uint value_cl_uint; res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0); if (res != 0) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value_int64, 0); if (res != 0) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxThreadsNum = value_int64; res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &value_cl_uint, 0); if (res != 0) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } size_t* dims = (size_t*)malloc(sizeof(size_t) * value_cl_uint); if (dims) { res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * value_cl_uint, dims, 0); if (res != 0) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.maxComputeWorkGroupSize[0] = dims[0]; app->configuration.maxComputeWorkGroupSize[1] = dims[1]; app->configuration.maxComputeWorkGroupSize[2] = dims[2]; free(dims); dims = 0; } else { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } app->configuration.maxComputeWorkGroupCount[0] = UINT64_MAX; app->configuration.maxComputeWorkGroupCount[1] = UINT64_MAX; app->configuration.maxComputeWorkGroupCount[2] = UINT64_MAX; //if ((vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; 
cl_ulong sharedMemorySize; res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &sharedMemorySize, 0); if (res != 0) { deleteVkFFT(app); return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; } app->configuration.sharedMemorySize = sharedMemorySize; app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(sharedMemorySize)); app->configuration.vendorID = vendorID; app->configuration.useRaderUintLUT = 1; switch (vendorID) { case 0x10DE://NVIDIA app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 4; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 4194305 : 4194305; app->configuration.sharedMemorySize -= 0x10;//reserved by system app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); break; case 0x8086://INTEL app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = 1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; case 0x1002://AMD app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 
1 : -1; app->configuration.warpSize = 64; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 2 : 4; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; default: app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.warpSize = 32; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 262144 : 524288; break; } #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.context == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_CONTEXT; } app->configuration.context = inputLaunchConfiguration.context; if (inputLaunchConfiguration.commandQueue == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_QUEUE; } app->configuration.commandQueue = inputLaunchConfiguration.commandQueue; app->configuration.commandQueueID = inputLaunchConfiguration.commandQueueID; ze_device_properties_t device_properties; ze_device_compute_properties_t compute_properties; res = zeDeviceGetProperties(app->configuration.device[0], &device_properties); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; res = zeDeviceGetComputeProperties(app->configuration.device[0], &compute_properties); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; uint32_t vendorID = device_properties.vendorId; app->configuration.maxThreadsNum = compute_properties.maxTotalGroupSize; app->configuration.maxComputeWorkGroupSize[0] = 
compute_properties.maxGroupSizeX; app->configuration.maxComputeWorkGroupSize[1] = compute_properties.maxGroupSizeY; app->configuration.maxComputeWorkGroupSize[2] = compute_properties.maxGroupSizeZ; app->configuration.maxComputeWorkGroupCount[0] = compute_properties.maxGroupCountX; app->configuration.maxComputeWorkGroupCount[1] = compute_properties.maxGroupCountY; app->configuration.maxComputeWorkGroupCount[2] = compute_properties.maxGroupCountZ; //if ((vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; app->configuration.sharedMemorySize = compute_properties.maxSharedLocalMemory; app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; app->configuration.useLUT = 1; app->configuration.warpSize = device_properties.physicalEUSimdWidth; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = (app->configuration.sharedMemorySize >= 65536) ? 1 : 2; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 
262144 : 524288; app->configuration.vendorID = 0x8086; app->configuration.useRaderUintLUT = 1; #elif(VKFFT_BACKEND==5) if (inputLaunchConfiguration.device == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_DEVICE; } app->configuration.device = inputLaunchConfiguration.device; if (inputLaunchConfiguration.queue == 0) { deleteVkFFT(app); return VKFFT_ERROR_INVALID_QUEUE; } app->configuration.queue = inputLaunchConfiguration.queue; const char dummy_kernel[50] = "kernel void VkFFT_dummy (){}"; const char function_name[20] = "VkFFT_dummy"; NS::Error* error; MTL::CompileOptions* compileOptions = MTL::CompileOptions::alloc(); NS::String* str_code = NS::String::string(dummy_kernel, NS::UTF8StringEncoding); MTL::Library* dummy_library = app->configuration.device->newLibrary(str_code, compileOptions, &error); NS::String* str_name = NS::String::string(function_name, NS::UTF8StringEncoding); MTL::Function* function = dummy_library->newFunction(str_name); MTL::ComputePipelineState* dummy_state = app->configuration.device->newComputePipelineState(function, &error); MTL::Size size = app->configuration.device->maxThreadsPerThreadgroup(); app->configuration.maxThreadsNum = dummy_state->maxTotalThreadsPerThreadgroup(); app->configuration.maxComputeWorkGroupSize[0] = size.width; app->configuration.maxComputeWorkGroupSize[1] = size.height; app->configuration.maxComputeWorkGroupSize[2] = size.depth; if (app->configuration.maxThreadsNum > 256) { app->configuration.maxThreadsNum = 256; app->configuration.maxComputeWorkGroupSize[0] = 256; app->configuration.maxComputeWorkGroupSize[1] = 256; app->configuration.maxComputeWorkGroupSize[2] = 256; //The dummy kernel approach (above) does not work for some DCT-IV kernels (like 256x256x256). They refuse to have more than 256 threads. I will just force OpenCL thread limits for now. 
} app->configuration.maxComputeWorkGroupCount[0] = -1; app->configuration.maxComputeWorkGroupCount[1] = -1; app->configuration.maxComputeWorkGroupCount[2] = -1; app->configuration.sharedMemorySizeStatic = app->configuration.device->maxThreadgroupMemoryLength(); app->configuration.sharedMemorySize = app->configuration.device->maxThreadgroupMemoryLength(); app->configuration.warpSize = dummy_state->threadExecutionWidth(); app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); app->configuration.useRaderUintLUT = 1; app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64;//the coalesced memory is equal to 64 bytes between L2 and VRAM. app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : -1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost = 1; app->configuration.registerBoost4Step = 1; app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 
262144 : 524288; app->configuration.vendorID = 0x1027f00; dummy_state->release(); function->release(); str_name->release(); dummy_library->release(); str_code->release(); compileOptions->release(); #endif resFFT = initializeBluesteinAutoPadding(app); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } //set main parameters: if (inputLaunchConfiguration.FFTdim == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_FFTdim; } app->configuration.FFTdim = inputLaunchConfiguration.FFTdim; if (inputLaunchConfiguration.size[0] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_size; } app->configuration.size[0] = inputLaunchConfiguration.size[0]; if (inputLaunchConfiguration.bufferStride[0] == 0) { if (inputLaunchConfiguration.performR2C) app->configuration.bufferStride[0] = app->configuration.size[0] / 2 + 1; else app->configuration.bufferStride[0] = app->configuration.size[0]; } else app->configuration.bufferStride[0] = inputLaunchConfiguration.bufferStride[0]; if (inputLaunchConfiguration.inputBufferStride[0] == 0) { if (inputLaunchConfiguration.performR2C) app->configuration.inputBufferStride[0] = app->configuration.size[0] + 2; else app->configuration.inputBufferStride[0] = app->configuration.size[0]; } else app->configuration.inputBufferStride[0] = inputLaunchConfiguration.inputBufferStride[0]; if (inputLaunchConfiguration.outputBufferStride[0] == 0) { if (inputLaunchConfiguration.performR2C) app->configuration.outputBufferStride[0] = app->configuration.size[0] + 2; else app->configuration.outputBufferStride[0] = app->configuration.size[0]; } else app->configuration.outputBufferStride[0] = inputLaunchConfiguration.outputBufferStride[0]; for (uint64_t i = 1; i < 3; i++) { if (inputLaunchConfiguration.size[i] == 0) app->configuration.size[i] = 1; else app->configuration.size[i] = inputLaunchConfiguration.size[i]; if (inputLaunchConfiguration.bufferStride[i] == 0) app->configuration.bufferStride[i] = app->configuration.bufferStride[i - 1] * 
app->configuration.size[i]; else app->configuration.bufferStride[i] = inputLaunchConfiguration.bufferStride[i]; if (inputLaunchConfiguration.inputBufferStride[i] == 0) app->configuration.inputBufferStride[i] = app->configuration.inputBufferStride[i - 1] * app->configuration.size[i]; else app->configuration.inputBufferStride[i] = inputLaunchConfiguration.inputBufferStride[i]; if (inputLaunchConfiguration.outputBufferStride[i] == 0) app->configuration.outputBufferStride[i] = app->configuration.outputBufferStride[i - 1] * app->configuration.size[i]; else app->configuration.outputBufferStride[i] = inputLaunchConfiguration.outputBufferStride[i]; } app->configuration.isInputFormatted = inputLaunchConfiguration.isInputFormatted; app->configuration.isOutputFormatted = inputLaunchConfiguration.isOutputFormatted; app->configuration.performConvolution = inputLaunchConfiguration.performConvolution; if (inputLaunchConfiguration.bufferNum == 0) app->configuration.bufferNum = 1; else app->configuration.bufferNum = inputLaunchConfiguration.bufferNum; #if(VKFFT_BACKEND==0) if (inputLaunchConfiguration.bufferSize == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_bufferSize; } #endif app->configuration.bufferSize = inputLaunchConfiguration.bufferSize; if (app->configuration.bufferSize != 0) { for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { if (app->configuration.bufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_bufferSize; } } } app->configuration.buffer = inputLaunchConfiguration.buffer; if (inputLaunchConfiguration.userTempBuffer != 0) app->configuration.userTempBuffer = inputLaunchConfiguration.userTempBuffer; if (app->configuration.userTempBuffer != 0) { if (inputLaunchConfiguration.tempBufferNum == 0) app->configuration.tempBufferNum = 1; else app->configuration.tempBufferNum = inputLaunchConfiguration.tempBufferNum; #if(VKFFT_BACKEND==0) if (inputLaunchConfiguration.tempBufferSize == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_tempBufferSize; } 
#endif app->configuration.tempBufferSize = inputLaunchConfiguration.tempBufferSize; if (app->configuration.tempBufferSize != 0) { for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { if (app->configuration.tempBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_tempBufferSize; } } } app->configuration.tempBuffer = inputLaunchConfiguration.tempBuffer; } else { app->configuration.tempBufferNum = 1; app->configuration.tempBufferSize = (uint64_t*)malloc(sizeof(uint64_t)); if (!app->configuration.tempBufferSize) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } app->configuration.tempBufferSize[0] = 0; } if (app->configuration.isInputFormatted) { if (inputLaunchConfiguration.inputBufferNum == 0) app->configuration.inputBufferNum = 1; else app->configuration.inputBufferNum = inputLaunchConfiguration.inputBufferNum; #if(VKFFT_BACKEND==0) if (inputLaunchConfiguration.inputBufferSize == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_inputBufferSize; } #endif app->configuration.inputBufferSize = inputLaunchConfiguration.inputBufferSize; if (app->configuration.inputBufferSize != 0) { for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { if (app->configuration.inputBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_inputBufferSize; } } } app->configuration.inputBuffer = inputLaunchConfiguration.inputBuffer; } else { app->configuration.inputBufferNum = app->configuration.bufferNum; app->configuration.inputBufferSize = app->configuration.bufferSize; app->configuration.inputBuffer = app->configuration.buffer; } if (app->configuration.isOutputFormatted) { if (inputLaunchConfiguration.outputBufferNum == 0) app->configuration.outputBufferNum = 1; else app->configuration.outputBufferNum = inputLaunchConfiguration.outputBufferNum; #if(VKFFT_BACKEND==0) if (inputLaunchConfiguration.outputBufferSize == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_outputBufferSize; } #endif app->configuration.outputBufferSize = 
inputLaunchConfiguration.outputBufferSize; if (app->configuration.outputBufferSize != 0) { for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { if (app->configuration.outputBufferSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_outputBufferSize; } } } app->configuration.outputBuffer = inputLaunchConfiguration.outputBuffer; } else { app->configuration.outputBufferNum = app->configuration.bufferNum; app->configuration.outputBufferSize = app->configuration.bufferSize; app->configuration.outputBuffer = app->configuration.buffer; } if (app->configuration.performConvolution) { if (inputLaunchConfiguration.kernelNum == 0) app->configuration.kernelNum = 1; else app->configuration.kernelNum = inputLaunchConfiguration.kernelNum; #if(VKFFT_BACKEND==0) if (inputLaunchConfiguration.kernelSize == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_kernelSize; } #endif app->configuration.kernelSize = inputLaunchConfiguration.kernelSize; if (app->configuration.kernelSize != 0) { for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { if (app->configuration.kernelSize[i] == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_kernelSize; } } } app->configuration.kernel = inputLaunchConfiguration.kernel; } if (inputLaunchConfiguration.bufferOffset != 0) app->configuration.bufferOffset = inputLaunchConfiguration.bufferOffset; if (inputLaunchConfiguration.tempBufferOffset != 0) app->configuration.tempBufferOffset = inputLaunchConfiguration.tempBufferOffset; if (inputLaunchConfiguration.inputBufferOffset != 0) app->configuration.inputBufferOffset = inputLaunchConfiguration.inputBufferOffset; if (inputLaunchConfiguration.outputBufferOffset != 0) app->configuration.outputBufferOffset = inputLaunchConfiguration.outputBufferOffset; if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset; if (inputLaunchConfiguration.specifyOffsetsAtLaunch != 0) app->configuration.specifyOffsetsAtLaunch = 
inputLaunchConfiguration.specifyOffsetsAtLaunch; //set optional parameters: uint64_t checkBufferSizeFor64BitAddressing = 0; for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { if (app->configuration.bufferSize) checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i]; else { checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8; if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures; if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches; if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels; if (app->configuration.doublePrecision) checkBufferSizeFor64BitAddressing *= 2; } } #if(VKFFT_BACKEND==2) app->configuration.useStrict32BitAddress = 0; if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; #endif if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { if (app->configuration.inputBufferSize) checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i]; } #if(VKFFT_BACKEND==2) if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; #endif if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { if (app->configuration.outputBufferSize) checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i]; } if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) 
app->configuration.useUint64 = 1; checkBufferSizeFor64BitAddressing = 0; for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { if (app->configuration.kernelSize) checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i]; } #if(VKFFT_BACKEND==2) if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)32)) app->configuration.useStrict32BitAddress = -1; // No reason was found to disable strict 32 bit addressing, so enable it if (app->configuration.useStrict32BitAddress == 0) app->configuration.useStrict32BitAddress = 1; #endif if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64; #if(VKFFT_BACKEND==2) if (inputLaunchConfiguration.useStrict32BitAddress != 0) app->configuration.useStrict32BitAddress = inputLaunchConfiguration.useStrict32BitAddress; #endif if (inputLaunchConfiguration.coalescedMemory != 0) app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory; app->configuration.aimThreads = 128; if (inputLaunchConfiguration.aimThreads != 0) app->configuration.aimThreads = inputLaunchConfiguration.aimThreads; app->configuration.numSharedBanks = 32; if (inputLaunchConfiguration.numSharedBanks != 0) app->configuration.numSharedBanks = inputLaunchConfiguration.numSharedBanks; if (inputLaunchConfiguration.inverseReturnToInputBuffer != 0) app->configuration.inverseReturnToInputBuffer = inputLaunchConfiguration.inverseReturnToInputBuffer; if (inputLaunchConfiguration.useLUT != 0) app->configuration.useLUT = inputLaunchConfiguration.useLUT; if (inputLaunchConfiguration.useLUT_4step != 0) { if (inputLaunchConfiguration.useLUT_4step > 0) app->configuration.useLUT = 1; app->configuration.useLUT_4step = inputLaunchConfiguration.useLUT_4step; } else { if (app->configuration.useLUT_4step == 0) app->configuration.useLUT_4step = 
app->configuration.useLUT; } if (app->configuration.useLUT == -1) app->configuration.useLUT_4step = -1; if (inputLaunchConfiguration.fixMaxRadixBluestein != 0) app->configuration.fixMaxRadixBluestein = inputLaunchConfiguration.fixMaxRadixBluestein; if (inputLaunchConfiguration.forceBluesteinSequenceSize != 0) app->configuration.forceBluesteinSequenceSize = inputLaunchConfiguration.forceBluesteinSequenceSize; app->configuration.fixMinRaderPrimeMult = 17; switch (app->configuration.vendorID) { case 0x10DE://NVIDIA app->configuration.fixMaxRaderPrimeMult = 89; break; case 0x1002://AMD profile app->configuration.fixMaxRaderPrimeMult = 89; break; default: app->configuration.fixMaxRaderPrimeMult = 17; break; } if (inputLaunchConfiguration.fixMinRaderPrimeMult != 0) app->configuration.fixMinRaderPrimeMult = inputLaunchConfiguration.fixMinRaderPrimeMult; if (inputLaunchConfiguration.fixMaxRaderPrimeMult != 0) app->configuration.fixMaxRaderPrimeMult = inputLaunchConfiguration.fixMaxRaderPrimeMult; switch (app->configuration.vendorID) { case 0x1002://AMD profile if (app->configuration.doublePrecision) app->configuration.fixMinRaderPrimeFFT = 29; else app->configuration.fixMinRaderPrimeFFT = 17; break; default: app->configuration.fixMinRaderPrimeFFT = 17; break; } app->configuration.fixMaxRaderPrimeFFT = 16384; if (inputLaunchConfiguration.fixMinRaderPrimeFFT != 0) app->configuration.fixMinRaderPrimeFFT = inputLaunchConfiguration.fixMinRaderPrimeFFT; if (inputLaunchConfiguration.fixMaxRaderPrimeFFT != 0) app->configuration.fixMaxRaderPrimeFFT = inputLaunchConfiguration.fixMaxRaderPrimeFFT; if (inputLaunchConfiguration.performR2C != 0) { app->configuration.performR2C = inputLaunchConfiguration.performR2C; } if (inputLaunchConfiguration.performDCT != 0) { app->configuration.performDCT = inputLaunchConfiguration.performDCT; } if (inputLaunchConfiguration.disableMergeSequencesR2C != 0) { app->configuration.disableMergeSequencesR2C = 
inputLaunchConfiguration.disableMergeSequencesR2C; } app->configuration.normalize = 0; if (inputLaunchConfiguration.normalize != 0) app->configuration.normalize = inputLaunchConfiguration.normalize; if (inputLaunchConfiguration.makeForwardPlanOnly != 0) app->configuration.makeForwardPlanOnly = inputLaunchConfiguration.makeForwardPlanOnly; if (inputLaunchConfiguration.makeInversePlanOnly != 0) app->configuration.makeInversePlanOnly = inputLaunchConfiguration.makeInversePlanOnly; app->configuration.reorderFourStep = 1; if (inputLaunchConfiguration.disableReorderFourStep != 0) { app->configuration.reorderFourStep = 0; if (app->configuration.swapTo3Stage4Step < 1048576) app->configuration.swapTo3Stage4Step = 1048576; } if (inputLaunchConfiguration.frequencyZeroPadding != 0) app->configuration.frequencyZeroPadding = inputLaunchConfiguration.frequencyZeroPadding; for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (inputLaunchConfiguration.performZeropadding[i] != 0) { app->configuration.performZeropadding[i] = inputLaunchConfiguration.performZeropadding[i]; app->configuration.fft_zeropad_left[i] = inputLaunchConfiguration.fft_zeropad_left[i]; app->configuration.fft_zeropad_right[i] = inputLaunchConfiguration.fft_zeropad_right[i]; } } if (inputLaunchConfiguration.registerBoost != 0) app->configuration.registerBoost = inputLaunchConfiguration.registerBoost; if (inputLaunchConfiguration.registerBoostNonPow2 != 0) app->configuration.registerBoostNonPow2 = inputLaunchConfiguration.registerBoostNonPow2; if (inputLaunchConfiguration.registerBoost4Step != 0) app->configuration.registerBoost4Step = inputLaunchConfiguration.registerBoost4Step; if (app->configuration.performR2C != 0) { app->configuration.registerBoost = 1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost4Step = 1; } app->configuration.coordinateFeatures = 1; app->configuration.numberBatches = 1; if (inputLaunchConfiguration.coordinateFeatures != 0) 
app->configuration.coordinateFeatures = inputLaunchConfiguration.coordinateFeatures; if (inputLaunchConfiguration.numberBatches != 0) app->configuration.numberBatches = inputLaunchConfiguration.numberBatches; app->configuration.matrixConvolution = 1; app->configuration.numberKernels = 1; if (inputLaunchConfiguration.kernelConvolution != 0) { app->configuration.kernelConvolution = inputLaunchConfiguration.kernelConvolution; app->configuration.reorderFourStep = 0; app->configuration.registerBoost = 1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost4Step = 1; } if (app->configuration.performConvolution) { if (inputLaunchConfiguration.matrixConvolution != 0) app->configuration.matrixConvolution = inputLaunchConfiguration.matrixConvolution; if (inputLaunchConfiguration.numberKernels != 0) app->configuration.numberKernels = inputLaunchConfiguration.numberKernels; if (inputLaunchConfiguration.symmetricKernel != 0) app->configuration.symmetricKernel = inputLaunchConfiguration.symmetricKernel; if (inputLaunchConfiguration.conjugateConvolution != 0) app->configuration.conjugateConvolution = inputLaunchConfiguration.conjugateConvolution; if (inputLaunchConfiguration.crossPowerSpectrumNormalization != 0) app->configuration.crossPowerSpectrumNormalization = inputLaunchConfiguration.crossPowerSpectrumNormalization; app->configuration.reorderFourStep = 0; app->configuration.registerBoost = 1; app->configuration.registerBoostNonPow2 = 0; app->configuration.registerBoost4Step = 1; if (app->configuration.matrixConvolution > 1) app->configuration.coordinateFeatures = app->configuration.matrixConvolution; } app->firstAxis = 0; app->lastAxis = app->configuration.FFTdim - 1; if (inputLaunchConfiguration.omitDimension[0] != 0) { app->configuration.omitDimension[0] = inputLaunchConfiguration.omitDimension[0]; app->firstAxis++; if (app->configuration.performConvolution) { deleteVkFFT(app); return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; } if 
(app->configuration.performR2C) { deleteVkFFT(app); return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; } } if (inputLaunchConfiguration.omitDimension[2] != 0) { app->configuration.omitDimension[2] = inputLaunchConfiguration.omitDimension[2]; app->lastAxis--; if (app->configuration.performConvolution) { deleteVkFFT(app); return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; } } if (inputLaunchConfiguration.omitDimension[1] != 0) { app->configuration.omitDimension[1] = inputLaunchConfiguration.omitDimension[1]; if (app->configuration.omitDimension[0] == 1) app->firstAxis++; if (app->configuration.omitDimension[2] == 1) app->lastAxis--; if (app->configuration.performConvolution) { deleteVkFFT(app); return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; } } if (app->firstAxis > app->lastAxis) { deleteVkFFT(app); return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; } if (inputLaunchConfiguration.reorderFourStep != 0) app->configuration.reorderFourStep = inputLaunchConfiguration.reorderFourStep; app->configuration.maxCodeLength = 4000000; if (inputLaunchConfiguration.maxCodeLength != 0) app->configuration.maxCodeLength = inputLaunchConfiguration.maxCodeLength; app->configuration.maxTempLength = 5000; if (inputLaunchConfiguration.maxTempLength != 0) app->configuration.maxTempLength = inputLaunchConfiguration.maxTempLength; if (inputLaunchConfiguration.useRaderUintLUT != 0) app->configuration.useRaderUintLUT = inputLaunchConfiguration.useRaderUintLUT; if (inputLaunchConfiguration.halfThreads != 0) app->configuration.halfThreads = inputLaunchConfiguration.halfThreads; if (inputLaunchConfiguration.swapTo3Stage4Step != 0) app->configuration.swapTo3Stage4Step = inputLaunchConfiguration.swapTo3Stage4Step; if (app->configuration.performDCT > 0) app->configuration.performBandwidthBoost = -1; if (inputLaunchConfiguration.performBandwidthBoost != 0) app->configuration.performBandwidthBoost = inputLaunchConfiguration.performBandwidthBoost; if (inputLaunchConfiguration.devicePageSize != 0) app->configuration.devicePageSize = 
inputLaunchConfiguration.devicePageSize; if (inputLaunchConfiguration.localPageSize != 0) app->configuration.localPageSize = inputLaunchConfiguration.localPageSize; if (inputLaunchConfiguration.keepShaderCode != 0) app->configuration.keepShaderCode = inputLaunchConfiguration.keepShaderCode; if (inputLaunchConfiguration.printMemoryLayout != 0) app->configuration.printMemoryLayout = inputLaunchConfiguration.printMemoryLayout; if (inputLaunchConfiguration.considerAllAxesStrided != 0) app->configuration.considerAllAxesStrided = inputLaunchConfiguration.considerAllAxesStrided; #if(VKFFT_BACKEND!=5) if (inputLaunchConfiguration.loadApplicationString != 0) app->configuration.loadApplicationString = inputLaunchConfiguration.loadApplicationString; if (inputLaunchConfiguration.saveApplicationToString != 0) app->configuration.saveApplicationToString = inputLaunchConfiguration.saveApplicationToString; #endif if (inputLaunchConfiguration.disableSetLocale != 0) app->configuration.disableSetLocale = inputLaunchConfiguration.disableSetLocale; if (inputLaunchConfiguration.loadApplicationFromString != 0) { app->configuration.loadApplicationFromString = inputLaunchConfiguration.loadApplicationFromString; if (app->configuration.saveApplicationToString != 0) { deleteVkFFT(app); return VKFFT_ERROR_ENABLED_saveApplicationToString; } if (app->configuration.loadApplicationString == 0) { deleteVkFFT(app); return VKFFT_ERROR_EMPTY_applicationString; } memcpy(&app->applicationStringSize, app->configuration.loadApplicationString, sizeof(uint64_t)); memcpy(&app->applicationStringOffsetRader, (char*)app->configuration.loadApplicationString + 2 * sizeof(uint64_t), sizeof(uint64_t)); app->currentApplicationStringPos = 5 * sizeof(uint64_t); } //temporary set: app->configuration.registerBoost4Step = 1; #if(VKFFT_BACKEND==0) app->configuration.useUint64 = 0; //No physical addressing mode in Vulkan shaders. Use multiple-buffer support to achieve emulation of physical addressing. 
#endif //uint64_t initSharedMemory = app->configuration.sharedMemorySize; if (!app->configuration.makeForwardPlanOnly) { app->localFFTPlan_inverse = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); if (app->localFFTPlan_inverse) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : initSharedMemory; resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, i); if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, i); app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeMult = temp_fixMaxRaderPrimeMult; } if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { app->localFFTPlan_inverse->inverseBluesteinAxes[i][j] = app->localFFTPlan_inverse->axes[i][j]; } } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 1); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } } if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { resFFT = VkFFTPlanR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, 1); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } } } else { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } } if (!app->configuration.makeInversePlanOnly) { app->localFFTPlan = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); if (app->localFFTPlan) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; resFFT = VkFFTScheduler(app, app->localFFTPlan, i); if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; resFFT = VkFFTScheduler(app, app->localFFTPlan, i); app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; app->configuration.fixMaxRaderPrimeMult = temp_fixMaxRaderPrimeMult; } if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { app->localFFTPlan->inverseBluesteinAxes[i][j] = app->localFFTPlan->axes[i][j]; } } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 1); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } } if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { resFFT = VkFFTPlanR2CMultiUploadDecomposition(app, app->localFFTPlan, 0); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } } } else { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { if (!app->configuration.makeInversePlanOnly) resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan, i); else resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan_inverse, i); if (resFFT != VKFFT_SUCCESS) { deleteVkFFT(app); return resFFT; } } } if (inputLaunchConfiguration.saveApplicationToString != 0) { uint64_t totalBinarySize = 5 * sizeof(uint64_t); if (!app->configuration.makeForwardPlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { totalBinarySize += app->localFFTPlan_inverse->axes[i][j].binarySize + sizeof(uint64_t); } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { totalBinarySize += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); } } if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { totalBinarySize += app->localFFTPlan_inverse->R2Cdecomposition.binarySize + sizeof(uint64_t); } } } if 
(!app->configuration.makeInversePlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { totalBinarySize += app->localFFTPlan->axes[i][j].binarySize + sizeof(uint64_t); } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { totalBinarySize += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); } } if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { totalBinarySize += app->localFFTPlan->R2Cdecomposition.binarySize + sizeof(uint64_t); } } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { totalBinarySize += app->applicationBluesteinStringSize[i]; } } if (app->numRaderFFTPrimes > 0) { app->applicationStringOffsetRader = totalBinarySize; for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { totalBinarySize += app->rader_buffer_size[i]; } } app->saveApplicationString = calloc(totalBinarySize, 1); if (!app->saveApplicationString) { deleteVkFFT(app); return VKFFT_ERROR_MALLOC_FAILED; } app->applicationStringSize = totalBinarySize; char* localApplicationStringCast = (char*)app->saveApplicationString; memcpy(localApplicationStringCast, &totalBinarySize, sizeof(uint64_t)); memcpy(localApplicationStringCast + 2, &app->applicationStringOffsetRader, sizeof(uint64_t)); uint64_t currentPos = 5 * sizeof(uint64_t); if (!app->configuration.makeForwardPlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->axes[i][j].binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->axes[i][j].binary, app->localFFTPlan_inverse->axes[i][j].binarySize); currentPos += 
app->localFFTPlan_inverse->axes[i][j].binarySize; } if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binary, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize); currentPos += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize; } } if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->R2Cdecomposition.binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->R2Cdecomposition.binary, app->localFFTPlan_inverse->R2Cdecomposition.binarySize); currentPos += app->localFFTPlan_inverse->R2Cdecomposition.binarySize; } } } if (!app->configuration.makeInversePlanOnly) { for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->axes[i][j].binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->axes[i][j].binary, app->localFFTPlan->axes[i][j].binarySize); currentPos += app->localFFTPlan->axes[i][j].binarySize; } if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, 
app->localFFTPlan->inverseBluesteinAxes[i][j].binary, app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize); currentPos += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize; } } if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->R2Cdecomposition.binarySize, sizeof(uint64_t)); currentPos += sizeof(uint64_t); memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->R2Cdecomposition.binary, app->localFFTPlan->R2Cdecomposition.binarySize); currentPos += app->localFFTPlan->R2Cdecomposition.binarySize; } } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->useBluesteinFFT[i]) { memcpy(localApplicationStringCast + currentPos, app->applicationBluesteinString[i], app->applicationBluesteinStringSize[i]); currentPos += app->applicationBluesteinStringSize[i]; } } if (app->numRaderFFTPrimes > 0) { for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { memcpy(localApplicationStringCast + currentPos, app->raderFFTkernel[i], app->rader_buffer_size[i]); currentPos += app->rader_buffer_size[i]; } } for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { if (app->applicationBluesteinString[i] != 0) { free(app->applicationBluesteinString[i]); app->applicationBluesteinString[i] = 0; } } } #if(VKFFT_BACKEND==0) if (app->configuration.isCompilerInitialized) { glslang_finalize_process(); app->configuration.isCompilerInitialized = 0; } #endif return resFFT; } static inline VkFFTResult dispatchEnhanced(VkFFTApplication* app, VkFFTAxis* axis, uint64_t* dispatchBlock) { VkFFTResult resFFT = VKFFT_SUCCESS; if (axis->specializationConstants.swapComputeWorkGroupID == 1) { uint64_t temp = dispatchBlock[0]; dispatchBlock[0] = dispatchBlock[1]; dispatchBlock[1] = temp; } if (axis->specializationConstants.swapComputeWorkGroupID == 2) { uint64_t temp = dispatchBlock[0]; dispatchBlock[0] = dispatchBlock[2]; dispatchBlock[2] = temp; } uint64_t blockNumber[3] = { 
(uint64_t)ceil(dispatchBlock[0] / (double)app->configuration.maxComputeWorkGroupCount[0]),(uint64_t)ceil(dispatchBlock[1] / (double)app->configuration.maxComputeWorkGroupCount[1]),(uint64_t)ceil(dispatchBlock[2] / (double)app->configuration.maxComputeWorkGroupCount[2]) }; uint64_t blockSize[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)blockNumber[0]), (uint64_t)ceil(dispatchBlock[1] / (double)blockNumber[1]), (uint64_t)ceil(dispatchBlock[2] / (double)blockNumber[2]) }; uint64_t lastBlockSize[3] = { blockSize[0],blockSize[1],blockSize[2] }; uint64_t dispatchSize[3] = { 1,1,1 }; if (blockNumber[0] == 0) blockNumber[0] = 1; if (blockNumber[1] == 0) blockNumber[1] = 1; if (blockNumber[2] == 0) blockNumber[2] = 1; if ((blockNumber[0] > 1) && (blockNumber[0] * blockSize[0] != dispatchBlock[0])) { lastBlockSize[0] = dispatchBlock[0] % blockSize[0]; } if ((blockNumber[1] > 1) && (blockNumber[1] * blockSize[1] != dispatchBlock[1])) { lastBlockSize[1] = dispatchBlock[1] % blockSize[1]; } if ((blockNumber[2] > 1) && (blockNumber[2] * blockSize[2] != dispatchBlock[2])) { lastBlockSize[2] = dispatchBlock[2] % blockSize[2]; } if (app->configuration.specifyOffsetsAtLaunch) { axis->updatePushConstants = 1; } //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", dispatchBlock[0], dispatchBlock[1], dispatchBlock[2]); //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", blockNumber[0], blockNumber[1], blockNumber[2]); for (uint64_t i = 0; i < 3; i++) if (blockNumber[i] == 1) blockSize[i] = dispatchBlock[i]; for (uint64_t i = 0; i < blockNumber[0]; i++) { for (uint64_t j = 0; j < blockNumber[1]; j++) { for (uint64_t k = 0; k < blockNumber[2]; k++) { if (axis->pushConstants.workGroupShift[0] != i * blockSize[0]) { axis->pushConstants.workGroupShift[0] = i * blockSize[0]; axis->updatePushConstants = 1; } if (axis->pushConstants.workGroupShift[1] != j * blockSize[1]) { axis->pushConstants.workGroupShift[1] = j * blockSize[1]; axis->updatePushConstants = 1; } if 
(axis->pushConstants.workGroupShift[2] != k * blockSize[2]) { axis->pushConstants.workGroupShift[2] = k * blockSize[2]; axis->updatePushConstants = 1; } if (axis->updatePushConstants) { if (app->configuration.useUint64) { uint64_t pushConstID = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[0]; pushConstID++; } if (axis->specializationConstants.performWorkGroupShift[1]) { axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[1]; pushConstID++; } if (axis->specializationConstants.performWorkGroupShift[2]) { axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[2]; pushConstID++; } if (axis->specializationConstants.performPostCompilationInputOffset) { axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.inputOffset / axis->specializationConstants.inputNumberByteSize; pushConstID++; } if (axis->specializationConstants.performPostCompilationOutputOffset) { axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.outputOffset / axis->specializationConstants.outputNumberByteSize; pushConstID++; } if (axis->specializationConstants.performPostCompilationKernelOffset) { if (axis->specializationConstants.kernelNumberByteSize != 0) axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.kernelOffset / axis->specializationConstants.kernelNumberByteSize; else axis->pushConstants.dataUint64[pushConstID] = 0; pushConstID++; } } else { uint64_t pushConstID = 0; if (axis->specializationConstants.performWorkGroupShift[0]) { axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[0]; pushConstID++; } if (axis->specializationConstants.performWorkGroupShift[1]) { axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[1]; pushConstID++; } if (axis->specializationConstants.performWorkGroupShift[2]) { 
axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[2]; pushConstID++; } if (axis->specializationConstants.performPostCompilationInputOffset) { axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.inputOffset / axis->specializationConstants.inputNumberByteSize); pushConstID++; } if (axis->specializationConstants.performPostCompilationOutputOffset) { axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.outputOffset / axis->specializationConstants.outputNumberByteSize); pushConstID++; } if (axis->specializationConstants.performPostCompilationKernelOffset) { if (axis->specializationConstants.kernelNumberByteSize != 0) axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.kernelOffset / axis->specializationConstants.kernelNumberByteSize); else axis->pushConstants.dataUint64[pushConstID] = 0; pushConstID++; } } } dispatchSize[0] = (i == blockNumber[0] - 1) ? lastBlockSize[0] : blockSize[0]; dispatchSize[1] = (j == blockNumber[1] - 1) ? lastBlockSize[1] : blockSize[1]; dispatchSize[2] = (k == blockNumber[2] - 1) ? 
lastBlockSize[2] : blockSize[2]; #if(VKFFT_BACKEND==0) if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) { vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)axis->pushConstants.structSize, axis->pushConstants.dataUint64); } else { vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)axis->pushConstants.structSize, axis->pushConstants.dataUint32); } } vkCmdDispatch(app->configuration.commandBuffer[0], (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1], (uint32_t)dispatchSize[2]); #elif(VKFFT_BACKEND==1) void* args[10]; CUresult result = CUDA_SUCCESS; args[0] = axis->inputBuffer; args[1] = axis->outputBuffer; uint64_t args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; args_id++; } if (axis->specializationConstants.LUT) { args[args_id] = &axis->bufferLUT; args_id++; } if (axis->specializationConstants.raderUintLUT) { args[args_id] = &axis->bufferRaderUintLUT; args_id++; } if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { if (axis->specializationConstants.inverseBluestein) args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; else args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; args_id++; } if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; args_id++; } //args[args_id] = &axis->pushConstants; if (axis->updatePushConstants) { axis->updatePushConstants = 0; if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) { result = cuMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint64, 
axis->pushConstants.structSize); } else { result = cuMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint32, axis->pushConstants.structSize); } if (result != CUDA_SUCCESS) { printf("cuMemcpyHtoD error: %d\n", result); return VKFFT_ERROR_FAILED_TO_COPY; } } } if (app->configuration.num_streams >= 1) { result = cuLaunchKernel(axis->VkFFTKernel, (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream args, 0); } else { result = cuLaunchKernel(axis->VkFFTKernel, (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream args, 0); } if (result != CUDA_SUCCESS) { printf("cuLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, dispatchSize[0], dispatchSize[1], dispatchSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; } if (app->configuration.num_streams > 1) { app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams; if (app->configuration.streamCounter == 0) { cudaError_t res2 = cudaEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]); if (res2 != cudaSuccess) return 
VKFFT_ERROR_FAILED_TO_EVENT_RECORD; } app->configuration.streamCounter++; } #elif(VKFFT_BACKEND==2) hipError_t result = hipSuccess; void* args[10]; args[0] = axis->inputBuffer; args[1] = axis->outputBuffer; uint64_t args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; args_id++; } if (axis->specializationConstants.LUT) { args[args_id] = &axis->bufferLUT; args_id++; } if (axis->specializationConstants.raderUintLUT) { args[args_id] = &axis->bufferRaderUintLUT; args_id++; } if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { if (axis->specializationConstants.inverseBluestein) args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; else args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; args_id++; } if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; args_id++; } //args[args_id] = &axis->pushConstants; if (axis->updatePushConstants) { axis->updatePushConstants = 0; if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) { result = hipMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint64, axis->pushConstants.structSize); } else { result = hipMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint32, axis->pushConstants.structSize); } if (result != hipSuccess) { printf("hipMemcpyHtoD error: %d\n", result); return VKFFT_ERROR_FAILED_TO_COPY; } } } //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); if (app->configuration.num_streams >= 1) { result = 
hipModuleLaunchKernel(axis->VkFFTKernel, (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream args, 0); } else { result = hipModuleLaunchKernel(axis->VkFFTKernel, (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream args, 0); } if (result != hipSuccess) { printf("hipModuleLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, dispatchSize[0], dispatchSize[1], dispatchSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; } if (app->configuration.num_streams > 1) { app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams; if (app->configuration.streamCounter == 0) { result = hipEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]); if (result != hipSuccess) return VKFFT_ERROR_FAILED_TO_EVENT_RECORD; } app->configuration.streamCounter++; } #elif(VKFFT_BACKEND==3) cl_int result = CL_SUCCESS; void* args[10]; args[0] = axis->inputBuffer; result = clSetKernelArg(axis->kernel, 0, sizeof(cl_mem), args[0]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } 
args[1] = axis->outputBuffer; result = clSetKernelArg(axis->kernel, 1, sizeof(cl_mem), args[1]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } uint64_t args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.LUT) { args[args_id] = &axis->bufferLUT; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.raderUintLUT) { args[args_id] = &axis->bufferRaderUintLUT; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { if (axis->specializationConstants.inverseBluestein) args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; else args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) 
{ result = clSetKernelArg(axis->kernel, (cl_uint)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint64); } else { result = clSetKernelArg(axis->kernel, (cl_uint)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint32); } if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } size_t local_work_size[3] = { (size_t)axis->specializationConstants.localSize[0], (size_t)axis->specializationConstants.localSize[1],(size_t)axis->specializationConstants.localSize[2] }; size_t global_work_size[3] = { (size_t)dispatchSize[0] * local_work_size[0] , (size_t)dispatchSize[1] * local_work_size[1] ,(size_t)dispatchSize[2] * local_work_size[2] }; result = clEnqueueNDRangeKernel(app->configuration.commandQueue[0], axis->kernel, 3, 0, global_work_size, local_work_size, 0, 0, 0); //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); if (result != CL_SUCCESS) { return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; } #elif(VKFFT_BACKEND==4) ze_result_t result = ZE_RESULT_SUCCESS; void* args[10]; args[0] = axis->inputBuffer; result = zeKernelSetArgumentValue(axis->VkFFTKernel, 0, sizeof(void*), args[0]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args[1] = axis->outputBuffer; result = zeKernelSetArgumentValue(axis->VkFFTKernel, 1, sizeof(void*), args[1]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } uint64_t args_id = 2; if (axis->specializationConstants.convolutionStep) { args[args_id] = app->configuration.kernel; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if 
(axis->specializationConstants.LUT) { args[args_id] = &axis->bufferLUT; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.raderUintLUT) { args[args_id] = &axis->bufferRaderUintLUT; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { if (axis->specializationConstants.inverseBluestein) args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; else args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) { result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint64); } else { result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint32); } if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; } args_id++; } 
size_t local_work_size[3] = { (size_t)axis->specializationConstants.localSize[0], (size_t)axis->specializationConstants.localSize[1],(size_t)axis->specializationConstants.localSize[2] }; ze_group_count_t launchArgs = { (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1],(uint32_t)dispatchSize[2] }; result = zeCommandListAppendLaunchKernel(app->configuration.commandList[0], axis->VkFFTKernel, &launchArgs, 0, 0, 0); //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); if (result != ZE_RESULT_SUCCESS) { return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; } #elif(VKFFT_BACKEND==5) app->configuration.commandEncoder->setComputePipelineState(axis->pipeline); void* args[10]; app->configuration.commandEncoder->setBuffer(axis->inputBuffer[0], 0, 0); app->configuration.commandEncoder->setBuffer(axis->outputBuffer[0], 0, 1); app->configuration.commandEncoder->setThreadgroupMemoryLength((uint64_t)ceil(axis->specializationConstants.usedSharedMemory / 16.0) * 16, 0); uint64_t args_id = 2; if (axis->specializationConstants.convolutionStep) { app->configuration.commandEncoder->setBuffer(app->configuration.kernel[0], 0, args_id); args_id++; } if (axis->specializationConstants.LUT) { app->configuration.commandEncoder->setBuffer(axis->bufferLUT, 0, args_id); args_id++; } if (axis->specializationConstants.raderUintLUT) { app->configuration.commandEncoder->setBuffer(axis->bufferRaderUintLUT, 0, args_id); args_id++; } if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { if (axis->specializationConstants.inverseBluestein) app->configuration.commandEncoder->setBuffer(app->bufferBluesteinIFFT[axis->specializationConstants.axis_id], 0, args_id); else 
app->configuration.commandEncoder->setBuffer(app->bufferBluesteinFFT[axis->specializationConstants.axis_id], 0, args_id); args_id++; } if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { app->configuration.commandEncoder->setBuffer(app->bufferBluestein[axis->specializationConstants.axis_id], 0, args_id); args_id++; } //args[args_id] = &axis->pushConstants; if (axis->pushConstants.structSize > 0) { if (app->configuration.useUint64) { if (!axis->pushConstants.dataUintBuffer) { axis->pushConstants.dataUintBuffer = app->configuration.device->newBuffer(axis->pushConstants.structSize, MTL::ResourceStorageModeShared); memcpy(axis->pushConstants.dataUintBuffer->contents(), axis->pushConstants.dataUint64, axis->pushConstants.structSize); axis->updatePushConstants = 0; } else if (axis->updatePushConstants) { memcpy(axis->pushConstants.dataUintBuffer->contents(), axis->pushConstants.dataUint64, axis->pushConstants.structSize); axis->updatePushConstants = 0; } app->configuration.commandEncoder->setBuffer(axis->pushConstants.dataUintBuffer, 0, args_id); } else { if (!axis->pushConstants.dataUintBuffer) { axis->pushConstants.dataUintBuffer = app->configuration.device->newBuffer(axis->pushConstants.structSize, MTL::ResourceStorageModeShared); memcpy(axis->pushConstants.dataUintBuffer->contents(), axis->pushConstants.dataUint32, axis->pushConstants.structSize); axis->updatePushConstants = 0; } else if (axis->updatePushConstants) { memcpy(axis->pushConstants.dataUintBuffer->contents(), axis->pushConstants.dataUint32, axis->pushConstants.structSize); axis->updatePushConstants = 0; } app->configuration.commandEncoder->setBuffer(axis->pushConstants.dataUintBuffer, 0, args_id); } args_id++; } MTL::Size threadsPerGrid = { dispatchSize[0] * axis->specializationConstants.localSize[0], dispatchSize[1] * axis->specializationConstants.localSize[1],dispatchSize[2] * 
axis->specializationConstants.localSize[2] }; MTL::Size threadsPerThreadgroup = { axis->specializationConstants.localSize[0],axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2] }; app->configuration.commandEncoder->dispatchThreads(threadsPerGrid, threadsPerThreadgroup); #endif } } } return resFFT; } static inline VkFFTResult VkFFTSync(VkFFTApplication* app) { #if(VKFFT_BACKEND==0) vkCmdPipelineBarrier(app->configuration.commandBuffer[0], VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, app->configuration.memory_barrier, 0, 0, 0, 0); #elif(VKFFT_BACKEND==1) if (app->configuration.num_streams > 1) { cudaError_t res = cudaSuccess; for (uint64_t s = 0; s < app->configuration.num_streams; s++) { res = cudaEventSynchronize(app->configuration.stream_event[s]); if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } app->configuration.streamCounter = 0; } #elif(VKFFT_BACKEND==2) if (app->configuration.num_streams > 1) { hipError_t res = hipSuccess; for (uint64_t s = 0; s < app->configuration.num_streams; s++) { res = hipEventSynchronize(app->configuration.stream_event[s]); if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; } app->configuration.streamCounter = 0; } #elif(VKFFT_BACKEND==3) #elif(VKFFT_BACKEND==4) ze_result_t res = ZE_RESULT_SUCCESS; res = zeCommandListAppendBarrier(app->configuration.commandList[0], nullptr, 0, nullptr); if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER; #elif(VKFFT_BACKEND==5) #endif return VKFFT_SUCCESS; } static inline void printDebugInformation(VkFFTApplication* app, VkFFTAxis* axis) { if (app->configuration.keepShaderCode) printf("%s\n", axis->specializationConstants.code0); if (app->configuration.printMemoryLayout) { if ((axis->inputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer)) printf("read: inputBuffer\n"); if (axis->inputBuffer == app->configuration.buffer) 
printf("read: buffer\n"); if (axis->inputBuffer == app->configuration.tempBuffer) printf("read: tempBuffer\n"); if ((axis->inputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer)) printf("read: outputBuffer\n"); if ((axis->outputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer)) printf("write: inputBuffer\n"); if (axis->outputBuffer == app->configuration.buffer) printf("write: buffer\n"); if (axis->outputBuffer == app->configuration.tempBuffer) printf("write: tempBuffer\n"); if ((axis->outputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer)) printf("write: outputBuffer\n"); } } static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams) { VkFFTResult resFFT = VKFFT_SUCCESS; #if(VKFFT_BACKEND==0) app->configuration.commandBuffer = launchParams->commandBuffer; VkMemoryBarrier memory_barrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER, 0, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, }; app->configuration.memory_barrier = &memory_barrier; #elif(VKFFT_BACKEND==1) app->configuration.streamCounter = 0; #elif(VKFFT_BACKEND==2) app->configuration.streamCounter = 0; #elif(VKFFT_BACKEND==3) app->configuration.commandQueue = launchParams->commandQueue; #elif(VKFFT_BACKEND==4) app->configuration.commandList = launchParams->commandList; #elif(VKFFT_BACKEND==5) app->configuration.commandBuffer = launchParams->commandBuffer; app->configuration.commandEncoder = launchParams->commandEncoder; #endif uint64_t localSize0[3]; if ((inverse != 1) && (app->configuration.makeInversePlanOnly)) return VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED; if ((inverse == 1) && (app->configuration.makeForwardPlanOnly)) return VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED; if ((inverse != 1) && (!app->configuration.makeInversePlanOnly) && (!app->localFFTPlan)) return 
VKFFT_ERROR_PLAN_NOT_INITIALIZED; if ((inverse == 1) && (!app->configuration.makeForwardPlanOnly) && (!app->localFFTPlan_inverse)) return VKFFT_ERROR_PLAN_NOT_INITIALIZED; if (inverse == 1) { localSize0[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0]; localSize0[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][0]; localSize0[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][0]; } else { localSize0[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0]; localSize0[1] = app->localFFTPlan->actualFFTSizePerAxis[1][0]; localSize0[2] = app->localFFTPlan->actualFFTSizePerAxis[2][0]; } resFFT = VkFFTCheckUpdateBufferSet(app, 0, 0, launchParams); if (resFFT != VKFFT_SUCCESS) { return resFFT; } if (inverse != 1) { //FFT axis 0 if (!app->configuration.omitDimension[0]) { for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1) && (l == 0)) ? 
1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan->numAxisUploads[0] > 2) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan->numAxisUploads[0] > 1) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches; if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != 
VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[0] && (app->localFFTPlan->numAxisUploads[0] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan->numAxisUploads[0] > 2) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan->numAxisUploads[0] > 1) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / 
(double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches; if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } if (app->localFFTPlan->multiUploadR2C) { VkFFTAxis* axis = &app->localFFTPlan->R2Cdecomposition; resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan, axis, 0, 0, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 
1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; dispatchBlock[2] = maxCoordinate * axis->specializationConstants.numBatches; resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; //app->configuration.size[0] *= 2; } } if (app->configuration.FFTdim > 1) { //FFT axis 1 if (!app->configuration.omitDimension[1]) { if ((app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) { for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 
1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * maxCoordinate * app->configuration.numberBatches; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } else { for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if 
(app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[1] && (app->localFFTPlan->numAxisUploads[1] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[1]; l++) { VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } } } } //FFT axis 2 if (app->configuration.FFTdim > 2) { if (!app->configuration.omitDimension[2]) { if ((app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) { for 
(int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * maxCoordinate * app->configuration.numberBatches; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } else { for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) { VkFFTAxis* axis = &app->localFFTPlan->axes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / 
(double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[2] && (app->localFFTPlan->numAxisUploads[2] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[2]; l++) { VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } } } } } if (app->configuration.performConvolution) { if 
(app->configuration.FFTdim > 2) { //multiple upload ifft leftovers if (app->configuration.FFTdim == 3) { for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[2]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberKernels; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / 
(double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } if (app->configuration.FFTdim > 1) { if (app->configuration.FFTdim == 2) { for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, 
dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } if (app->localFFTPlan_inverse->multiUploadR2C) { //app->configuration.size[0] /= 2; VkFFTAxis* axis = &app->localFFTPlan_inverse->R2Cdecomposition; resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, axis, 0, 0, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { dispatchBlock[0] = 
(uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } if (app->configuration.FFTdim == 1) { for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { VkFFTAxis* axis = 
&app->localFFTPlan_inverse->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } } if (inverse == 1) { //we start from axis 2 and go back to axis 0 //FFT axis 2 if (app->configuration.FFTdim > 2) { if (!app->configuration.omitDimension[2]) { for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[2] - 1; l >= 0; l--) { //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); 
vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; } if (app->useBluesteinFFT[2] && (app->localFFTPlan_inverse->numAxisUploads[2] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[2]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[2][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * 
app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } } } if (app->configuration.FFTdim > 1) { //FFT axis 1 if (!app->configuration.omitDimension[1]) { for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[1] - 1; l >= 0; l--) { //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l; VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); //if 
(app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l; resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[1] && (app->localFFTPlan_inverse->numAxisUploads[1] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[1][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); dispatchBlock[1] = 1; dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } 
} } if (!app->configuration.omitDimension[0]) { if (app->localFFTPlan_inverse->multiUploadR2C) { //app->configuration.size[0] /= 2; VkFFTAxis* axis = &app->localFFTPlan_inverse->R2Cdecomposition; resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, axis, 0, 0, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); dispatchBlock[1] = 1; dispatchBlock[2] = app->configuration.coordinateFeatures * axis->specializationConstants.numBatches; resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } //FFT axis 0 for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) { //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l; VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { dispatchBlock[0] = 
(uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l; resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } if (app->useBluesteinFFT[0] && 
(app->localFFTPlan_inverse->numAxisUploads[0] > 1)) { for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[0][l]; resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); if (resFFT != VKFFT_SUCCESS) return resFFT; #if(VKFFT_BACKEND==0) vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); #endif uint64_t dispatchBlock[3]; if (l == 0) { if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } else { dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); } } } else { dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; } dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * 
app->configuration.numberBatches; if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); resFFT = dispatchEnhanced(app, axis, dispatchBlock); if (resFFT != VKFFT_SUCCESS) return resFFT; printDebugInformation(app, axis); resFFT = VkFFTSync(app); if (resFFT != VKFFT_SUCCESS) return resFFT; } } } //if (app->localFFTPlan_inverse->multiUploadR2C) app->configuration.size[0] *= 2; } return resFFT; } static inline int VkFFTGetVersion() { return 10233; //X.XX.XX format } #endif