Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7b3072df
Commit
7b3072df
authored
Sep 08, 2010
by
Peter Eastman
Browse files
Continuing implementation of PME reciprocal space calculation
parent
ddf4e3b4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
260 additions
and
25 deletions
+260
-25
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
+1
-0
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
...ins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
+1
-0
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
+2
-0
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
...eba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
+108
-25
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
.../platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
+148
-0
No files found.
plugins/amoeba/platforms/cuda/src/kernels/AmoebaGpu.cpp
View file @
7b3072df
...
@@ -1374,6 +1374,7 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
...
@@ -1374,6 +1374,7 @@ void gpuElectrostaticAllocate( amoebaGpuContext amoebaGpu )
amoebaGpu
->
psForce
=
new
CUDAStream
<
float
>
(
paddedNumberOfAtoms
*
3
,
1
,
"ElectrostaticForce"
);
amoebaGpu
->
psForce
=
new
CUDAStream
<
float
>
(
paddedNumberOfAtoms
*
3
,
1
,
"ElectrostaticForce"
);
amoebaGpu
->
psTorque
=
new
CUDAStream
<
float
>
(
paddedNumberOfAtoms
*
3
,
1
,
"Torque"
);
amoebaGpu
->
psTorque
=
new
CUDAStream
<
float
>
(
paddedNumberOfAtoms
*
3
,
1
,
"Torque"
);
amoebaGpu
->
amoebaSim
.
pTorque
=
amoebaGpu
->
psTorque
->
_pDevData
;
unsigned
int
offset
=
3
*
paddedNumberOfAtoms
*
sizeof
(
float
);
unsigned
int
offset
=
3
*
paddedNumberOfAtoms
*
sizeof
(
float
);
memset
(
amoebaGpu
->
psForce
->
_pSysStream
[
0
],
0
,
offset
);
memset
(
amoebaGpu
->
psForce
->
_pSysStream
[
0
],
0
,
offset
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaKernels.h
View file @
7b3072df
...
@@ -110,6 +110,7 @@ extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
...
@@ -110,6 +110,7 @@ extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern
void
GetCalculateAmoebaCudaMapTorquesSim
(
amoebaGpuContext
gpu
);
extern
void
GetCalculateAmoebaCudaMapTorquesSim
(
amoebaGpuContext
gpu
);
extern
void
cudaComputeAmoebaMapTorques
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
);
extern
void
cudaComputeAmoebaMapTorques
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
);
extern
void
cudaComputeAmoebaMapTorquesAndAddTotalForce
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
,
CUDAStream
<
float4
>*
psOutputForce
);
extern
void
cudaComputeAmoebaMapTorquesAndAddTotalForce
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
,
CUDAStream
<
float4
>*
psOutputForce
);
extern
void
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float4
>*
psOutputForce
);
extern
void
SetCalculateAmoebaKirkwoodSim
(
amoebaGpuContext
amoebaGpu
);
extern
void
SetCalculateAmoebaKirkwoodSim
(
amoebaGpuContext
amoebaGpu
);
extern
void
GetCalculateAmoebaKirkwoodSim
(
amoebaGpuContext
amoebaGpu
);
extern
void
GetCalculateAmoebaKirkwoodSim
(
amoebaGpuContext
amoebaGpu
);
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaCudaTypes.h
View file @
7b3072df
...
@@ -140,6 +140,8 @@ struct cudaAmoebaGmxSimulation {
...
@@ -140,6 +140,8 @@ struct cudaAmoebaGmxSimulation {
float
*
pInducedDipoleS
;
float
*
pInducedDipoleS
;
float
*
pInducedDipolePolarS
;
float
*
pInducedDipolePolarS
;
float
*
pTorque
;
float
*
pWorkArray_3_1
;
float
*
pWorkArray_3_1
;
float
*
pWorkArray_3_2
;
float
*
pWorkArray_3_2
;
float
*
pWorkArray_1_1
;
float
*
pWorkArray_1_1
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaCudaPME.cu
View file @
7b3072df
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
#include "amoebaGpuTypes.h"
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "amoebaCudaKernels.h"
#include "bbsort.h"
//#define AMOEBA_DEBUG
//#define AMOEBA_DEBUG
...
@@ -11,7 +12,7 @@ static __constant__ cudaGmxSimulation cSim;
...
@@ -11,7 +12,7 @@ static __constant__ cudaGmxSimulation cSim;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
static
__constant__
cudaAmoebaGmxSimulation
cAmoebaSim
;
/* Cuda compiler on Windows does not recognized "static const float" values */
/* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
#define LOCAL_HACK_PI 3.1415926535897932384626433832795
f
void
SetCalculateAmoebaPMESim
(
amoebaGpuContext
amoebaGpu
)
void
SetCalculateAmoebaPMESim
(
amoebaGpuContext
amoebaGpu
)
{
{
...
@@ -100,13 +101,13 @@ __device__ void computeBSplinePoint(float4* thetai, float w, float* array)
...
@@ -100,13 +101,13 @@ __device__ void computeBSplinePoint(float4* thetai, float w, float* array)
*/
*/
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
512
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
256
,
1
)
#else
#else
__launch_bounds__
(
128
,
1
)
__launch_bounds__
(
128
,
1
)
#endif
#endif
void
kComputeBsplines_kernel
()
void
kCompute
Amoeba
Bsplines_kernel
()
{
{
extern
__shared__
float
bsplines_cache
[];
// size = block_size*pme_order*pme_order
extern
__shared__
float
bsplines_cache
[];
// size = block_size*pme_order*pme_order
float
*
array
=
&
bsplines_cache
[
threadIdx
.
x
*
AMOEBA_PME_ORDER
*
AMOEBA_PME_ORDER
];
float
*
array
=
&
bsplines_cache
[
threadIdx
.
x
*
AMOEBA_PME_ORDER
*
AMOEBA_PME_ORDER
];
...
@@ -158,11 +159,11 @@ void kComputeBsplines_kernel()
...
@@ -158,11 +159,11 @@ void kComputeBsplines_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kGridSpreadFixedMultipoles_kernel
()
void
kGridSpreadFixedMultipoles_kernel
()
{
{
...
@@ -252,11 +253,11 @@ void kGridSpreadFixedMultipoles_kernel()
...
@@ -252,11 +253,11 @@ void kGridSpreadFixedMultipoles_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kGridSpreadInducedDipoles_kernel
()
void
kGridSpreadInducedDipoles_kernel
()
{
{
...
@@ -342,11 +343,11 @@ void kGridSpreadInducedDipoles_kernel()
...
@@ -342,11 +343,11 @@ void kGridSpreadInducedDipoles_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 1
3
0)
#elif (__CUDA_ARCH__ >= 1
2
0)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kAmoebaReciprocalConvolution_kernel
()
void
kAmoebaReciprocalConvolution_kernel
()
{
{
...
@@ -380,11 +381,11 @@ void kAmoebaReciprocalConvolution_kernel()
...
@@ -380,11 +381,11 @@ void kAmoebaReciprocalConvolution_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kComputeFixedPotentialFromGrid_kernel
()
void
kComputeFixedPotentialFromGrid_kernel
()
{
{
...
@@ -496,11 +497,11 @@ void kComputeFixedPotentialFromGrid_kernel()
...
@@ -496,11 +497,11 @@ void kComputeFixedPotentialFromGrid_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kComputeInducedPotentialFromGrid_kernel
()
void
kComputeInducedPotentialFromGrid_kernel
()
{
{
...
@@ -702,11 +703,11 @@ void kComputeInducedPotentialFromGrid_kernel()
...
@@ -702,11 +703,11 @@ void kComputeInducedPotentialFromGrid_kernel()
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kComputeFixedMultipoleForceAndEnergy_kernel
()
void
kComputeFixedMultipoleForceAndEnergy_kernel
()
{
{
...
@@ -716,6 +717,8 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
...
@@ -716,6 +717,8 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
const
int
deriv3
[]
=
{
4
,
9
,
10
,
7
,
15
,
17
,
13
,
20
,
18
,
19
};
const
int
deriv3
[]
=
{
4
,
9
,
10
,
7
,
15
,
17
,
13
,
20
,
18
,
19
};
float
energy
=
0.0
f
;
float
energy
=
0.0
f
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
cSim
.
atoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
cSim
.
atoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force and energy.
multipole
[
0
]
=
cSim
.
pPosq
[
i
].
w
;
multipole
[
0
]
=
cSim
.
pPosq
[
i
].
w
;
multipole
[
1
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
];
multipole
[
1
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
];
multipole
[
2
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
+
1
];
multipole
[
2
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
+
1
];
...
@@ -741,17 +744,32 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
...
@@ -741,17 +744,32 @@ void kComputeFixedMultipoleForceAndEnergy_kernel()
force
.
y
+=
f
.
y
;
force
.
y
+=
f
.
y
;
force
.
z
+=
f
.
z
;
force
.
z
+=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
cSim
.
pForce4
[
i
]
=
force
;
// Compute the torque.
cAmoebaSim
.
pTorque
[
3
*
i
]
=
multipole
[
3
]
*
cAmoebaSim
.
pPhi
[
2
]
-
multipole
[
2
]
*
cAmoebaSim
.
pPhi
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
7
]
+
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
5
]
-
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
8
]
-
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
6
];
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
multipole
[
1
]
*
cAmoebaSim
.
pPhi
[
3
]
-
multipole
[
3
]
*
cAmoebaSim
.
pPhi
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
6
]
-
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
4
]
-
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
7
];
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
multipole
[
2
]
*
cAmoebaSim
.
pPhi
[
1
]
-
multipole
[
1
]
*
cAmoebaSim
.
pPhi
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
cAmoebaSim
.
pPhi
[
7
]
+
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
4
]
+
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
8
]
-
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
5
]
-
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
9
];
}
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
}
}
__global__
__global__
#if (__CUDA_ARCH__ >= 200)
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
1024
,
1
)
__launch_bounds__
(
768
,
1
)
#elif (__CUDA_ARCH__ >= 120)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__
(
512
,
1
)
__launch_bounds__
(
384
,
1
)
#else
#else
__launch_bounds__
(
256
,
1
)
__launch_bounds__
(
192
,
1
)
#endif
#endif
void
kComputeInducedDipoleForceAndEnergy_kernel
()
void
kComputeInducedDipoleForceAndEnergy_kernel
()
{
{
...
@@ -763,6 +781,8 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
...
@@ -763,6 +781,8 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
const
int
deriv3
[]
=
{
4
,
9
,
10
,
7
,
15
,
17
,
13
,
20
,
18
,
19
};
const
int
deriv3
[]
=
{
4
,
9
,
10
,
7
,
15
,
17
,
13
,
20
,
18
,
19
};
float
energy
=
0.0
f
;
float
energy
=
0.0
f
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
cSim
.
atoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
cSim
.
atoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force and energy.
multipole
[
0
]
=
cSim
.
pPosq
[
i
].
w
;
multipole
[
0
]
=
cSim
.
pPosq
[
i
].
w
;
multipole
[
1
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
];
multipole
[
1
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
];
multipole
[
2
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
+
1
];
multipole
[
2
]
=
cAmoebaSim
.
pLabFrameDipole
[
i
+
3
+
1
];
...
@@ -802,6 +822,69 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
...
@@ -802,6 +822,69 @@ void kComputeInducedDipoleForceAndEnergy_kernel()
force
.
y
+=
f
.
y
;
force
.
y
+=
f
.
y
;
force
.
z
+=
f
.
z
;
force
.
z
+=
f
.
z
;
cSim
.
pForce4
[
i
]
=
force
;
cSim
.
pForce4
[
i
]
=
force
;
// Compute the torque.
cAmoebaSim
.
pTorque
[
3
*
i
]
=
multipole
[
3
]
*
cAmoebaSim
.
pPhi
[
2
]
-
multipole
[
2
]
*
cAmoebaSim
.
pPhi
[
3
]
+
2.0
f
*
(
multipole
[
6
]
-
multipole
[
5
])
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
7
]
+
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
5
]
-
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
8
]
-
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
6
];
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
multipole
[
1
]
*
cAmoebaSim
.
pPhi
[
3
]
-
multipole
[
3
]
*
cAmoebaSim
.
pPhi
[
1
]
+
2.0
f
*
(
multipole
[
4
]
-
multipole
[
6
])
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
9
]
+
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
6
]
-
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
4
]
-
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
7
];
cAmoebaSim
.
pTorque
[
3
*
i
+
1
]
=
multipole
[
2
]
*
cAmoebaSim
.
pPhi
[
1
]
-
multipole
[
1
]
*
cAmoebaSim
.
pPhi
[
2
]
+
2.0
f
*
(
multipole
[
5
]
-
multipole
[
4
])
*
cAmoebaSim
.
pPhi
[
7
]
+
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
4
]
+
multipole
[
9
]
*
cAmoebaSim
.
pPhi
[
8
]
-
multipole
[
7
]
*
cAmoebaSim
.
pPhi
[
5
]
-
multipole
[
8
]
*
cAmoebaSim
.
pPhi
[
9
];
}
}
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
cSim
.
pEnergy
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
energy
;
}
}
extern
void
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpuContext
gpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float4
>*
psOutputForce
);
__global__
void
kFindAtomRangeForGrid_kernel
();
void
kCalculateAmoebaPME
(
amoebaGpuContext
amoebaGpu
)
{
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
threads
;
if
(
gpu
->
sm_version
>=
SM_20
)
threads
=
512
;
else
if
(
gpu
->
sm_version
>=
SM_12
)
threads
=
256
;
else
threads
=
128
;
kComputeAmoebaBsplines_kernel
<<<
gpu
->
sim
.
blocks
,
threads
,
threads
*
AMOEBA_PME_ORDER
*
AMOEBA_PME_ORDER
*
sizeof
(
float
)
>>>
();
LAUNCHERROR
(
"kComputeAmoebaBsplines"
);
bbSort
(
gpu
->
psPmeAtomGridIndex
->
_pDevData
,
gpu
->
natoms
);
kFindAtomRangeForGrid_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindAtomRangeForGrid"
);
// Perform PME for the fixed multipoles.
kGridSpreadFixedMultipoles_kernel
<<<
8
*
gpu
->
sim
.
blocks
,
64
>>>
();
LAUNCHERROR
(
"kGridSpreadFixedMultipoles"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_FORWARD
);
kAmoebaReciprocalConvolution_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
LAUNCHERROR
(
"kAmoebaReciprocalConvolution"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_INVERSE
);
kComputeFixedPotentialFromGrid_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeFixedPotentialFromGrid"
);
kComputeFixedMultipoleForceAndEnergy_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeFixedMultipoleForceAndEnergy"
);
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
gpu
->
psForce4
);
// Perform PME for the induced dipoles.
kGridSpreadInducedDipoles_kernel
<<<
8
*
gpu
->
sim
.
blocks
,
64
>>>
();
LAUNCHERROR
(
"kGridSpreadInducedDipoles"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_FORWARD
);
kAmoebaReciprocalConvolution_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
nonbond_threads_per_block
>>>
();
LAUNCHERROR
(
"kAmoebaReciprocalConvolution"
);
cufftExecC2C
(
gpu
->
fftplan
,
gpu
->
psPmeGrid
->
_pDevData
,
gpu
->
psPmeGrid
->
_pDevData
,
CUFFT_INVERSE
);
kComputeInducedPotentialFromGrid_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeInducedPotentialFromGrid"
);
kComputeInducedDipoleForceAndEnergy_kernel
<<<
gpu
->
sim
.
blocks
,
gpu
->
sim
.
update_threads_per_block
>>>
();
LAUNCHERROR
(
"kComputeInducedDipoleForceAndEnergy"
);
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpu
,
amoebaGpu
->
psTorque
,
gpu
->
psForce4
);
}
plugins/amoeba/platforms/cuda/src/kernels/kCalculateAmoebaMapTorques.cu
View file @
7b3072df
...
@@ -291,6 +291,59 @@ void amoebaMapTorqueReduce_kernel2(
...
@@ -291,6 +291,59 @@ void amoebaMapTorqueReduce_kernel2(
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__
(
GF1XX_THREADS_PER_BLOCK
,
1
)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__
(
GT2XX_THREADS_PER_BLOCK
,
1
)
#else
__launch_bounds__
(
G8X_THREADS_PER_BLOCK
,
1
)
#endif
void
amoebaMapTorqueReduce_kernel3
(
int
numThreads
,
int
numOfAtoms
,
int
maxDiff
,
float
*
tempElecForce
,
float4
*
outputForce
){
unsigned
int
tid
=
threadIdx
.
x
;
__shared__
float
sfx
[
BLOCK_SIZE
];
__shared__
float
sfy
[
BLOCK_SIZE
];
__shared__
float
sfz
[
BLOCK_SIZE
];
// load values then sum and add results to elecForce
if
(
tid
<
maxDiff
){
sfx
[
tid
]
=
tempElecForce
[
blockIdx
.
x
*
3
*
maxDiff
+
tid
*
3
];
sfy
[
tid
]
=
tempElecForce
[
blockIdx
.
x
*
3
*
maxDiff
+
tid
*
3
+
1
];
sfz
[
tid
]
=
tempElecForce
[
blockIdx
.
x
*
3
*
maxDiff
+
tid
*
3
+
2
];
}
else
{
sfx
[
tid
]
=
sfy
[
tid
]
=
sfz
[
tid
]
=
0.0
f
;
}
__syncthreads
();
for
(
unsigned
int
s
=
(
blockDim
.
x
)
/
2
;
s
!=
0
;
s
>>=
1
){
if
(
tid
<
s
){
sfx
[
tid
]
+=
sfx
[
tid
+
s
];
sfy
[
tid
]
+=
sfy
[
tid
+
s
];
sfz
[
tid
]
+=
sfz
[
tid
+
s
];
}
__syncthreads
();
}
if
(
tid
==
0
){
outputForce
[
blockIdx
.
x
].
x
+=
sfx
[
0
];
outputForce
[
blockIdx
.
x
].
y
+=
sfy
[
0
];
outputForce
[
blockIdx
.
x
].
z
+=
sfz
[
0
];
}
}
void
cudaComputeAmoebaMapTorques
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
)
void
cudaComputeAmoebaMapTorques
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float
>*
psForce
)
{
{
...
@@ -638,3 +691,98 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
...
@@ -638,3 +691,98 @@ void cudaComputeAmoebaMapTorquesAndAddTotalForce( amoebaGpuContext amoebaGpu,
#endif
#endif
}
}
void
cudaComputeAmoebaMapTorquesAndAddTotalForce2
(
amoebaGpuContext
amoebaGpu
,
CUDAStream
<
float
>*
psTorque
,
CUDAStream
<
float4
>*
psCudaForce4
)
{
// ---------------------------------------------------------------------------------------
//#define AMOEBA_DEBUG
#ifdef AMOEBA_DEBUG
static
const
char
*
methodName
=
"cudaComputeAmoebaMapTorquesAndAddTotalForce"
;
static
int
timestep
=
0
;
std
::
vector
<
int
>
fileId
;
timestep
++
;
fileId
.
resize
(
2
);
fileId
[
0
]
=
timestep
;
fileId
[
1
]
=
1
;
#endif
// check that BLOCK_SIZE >= amoebaGpu->maxMapTorqueDifference
if
(
amoebaGpu
->
maxMapTorqueDifference
>
BLOCK_SIZE
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"block size (%d) in amoebaMapTorqueReduce_kernel is too small ( > %d)! -- aborting.
\n
"
,
BLOCK_SIZE
,
amoebaGpu
->
maxMapTorqueDifference
);
exit
(
-
1
);
}
// ---------------------------------------------------------------------------------------
gpuContext
gpu
=
amoebaGpu
->
gpuContext
;
int
numThreads
=
min
(
256
,
(
gpu
->
natoms
));
int
numBlocks
=
1
+
(
gpu
->
natoms
/
numThreads
);
amoebaMapTorqueToForce_kernel
<<<
numBlocks
,
numThreads
>>>
(
gpu
->
natoms
,
gpu
->
psPosq4
->
_pDevStream
[
0
],
psTorque
->
_pDevStream
[
0
],
amoebaGpu
->
psMultipoleParticlesIdsAndAxisType
->
_pDevStream
[
0
],
amoebaGpu
->
maxMapTorqueDifference
,
amoebaGpu
->
torqueMapForce
->
_pDevStream
[
0
]
);
LAUNCHERROR
(
"AmoebaMapTrqKernel"
);
numBlocks
=
gpu
->
natoms
;
numThreads
=
amoebaGpu
->
maxMapTorqueDifferencePow2
;
amoebaMapTorqueReduce_kernel3
<<<
numBlocks
,
numThreads
>>>
(
numThreads
,
gpu
->
natoms
,
amoebaGpu
->
maxMapTorqueDifference
,
amoebaGpu
->
torqueMapForce
->
_pDevStream
[
0
],
psCudaForce4
->
_pDevStream
[
0
]
);
LAUNCHERROR
(
"amoebaMapTorqueReduce_kernel2"
);
#ifdef AMOEBA_DEBUG
if
(
amoebaGpu
->
log
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%s: numBlocks=%d numThreads=%d %d
\n
"
,
methodName
,
numBlocks
,
numThreads
,
amoebaGpu
->
maxMapTorqueDifferencePow2
);
(
void
)
fflush
(
amoebaGpu
->
log
);
amoebaGpu
->
psForce
->
Download
();
psCudaForce4
->
Download
();
amoebaGpu
->
psTorque
->
Download
();
int
maxPrint
=
10
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"Post torqueMap
\n
"
);
for
(
int
ii
=
0
;
ii
<
gpu
->
natoms
;
ii
++
){
(
void
)
fprintf
(
amoebaGpu
->
log
,
"%5d "
,
ii
);
int
indexOffset
=
ii
*
3
;
(
void
)
fprintf
(
amoebaGpu
->
log
,
"FTtl[%16.9e %16.9e %16.9e] "
,
psCudaForce4
->
_pSysStream
[
0
][
ii
].
x
,
psCudaForce4
->
_pSysStream
[
0
][
ii
].
y
,
psCudaForce4
->
_pSysStream
[
0
][
ii
].
z
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"F[%16.9e %16.9e %16.9e] "
,
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psForce
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
(
void
)
fprintf
(
amoebaGpu
->
log
,
"T[%16.9e %16.9e %16.9e]
\n
"
,
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
],
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
+
1
],
amoebaGpu
->
psTorque
->
_pSysStream
[
0
][
indexOffset
+
2
]
);
if
(
ii
==
maxPrint
&&
(
gpu
->
natoms
-
maxPrint
)
>
ii
)
ii
=
gpu
->
natoms
-
maxPrint
;
}
(
void
)
fflush
(
amoebaGpu
->
log
);
}
if
(
1
){
//std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors
outputVector
;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloat4Array
(
gpu
->
natoms
,
4
,
gpu
->
psForce4
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psForce
,
outputVector
);
cudaLoadCudaFloatArray
(
gpu
->
natoms
,
3
,
amoebaGpu
->
psTorque
,
outputVector
);
cudaWriteVectorOfDoubleVectorsToFile
(
"CudaVacuumElecForce"
,
fileId
,
outputVector
);
}
#endif
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment