6 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// Compatibility stub: hardware atomicAdd(float*, float) only exists on
// compute capability >= 2.0 (sm_2x). For older architectures this provides
// a no-op overload so the kernels below still compile.
// NOTE(review): the addition is silently discarded on sm_1x — presumably
// this build path is never executed on such hardware; confirm that NAMD
// refuses to run PME on pre-Fermi GPUs.
10 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
12 __device__
void atomicAdd(
float *,
float) { }
17 #include <hip/hip_runtime.h>
// Tiling parameters for the PME kernels: each thread block runs 8 warps,
// and each warp processes 4 atoms at a time.
24 #define warps_per_block 8
25 #define atoms_per_warp 4
// Atoms handled per block = 4 * 8 = 32 (derived, keep in sync with the two
// constants above).
26 #define atoms_per_block (atoms_per_warp * warps_per_block)
30 {{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}};
34 {{1, -5, 10, -10, 5, -1}, {0, 5, -20, 30, -20, 5}, {0, 10, -20, 0,
35 20, -10}, {0, 10, 20, -60, 20, 10}, {0, 5, 50, 0, -50, -5}, {0, 1,
40 {{1, -7, 21, -35, 35, -21, 7, -1}, {0, 7, -42, 105, -140, 105, -42,
41 7}, {0, 21, -84, 105, 0, -105, 84, -21}, {0, 35, 0, -315, 560, -315,
42 0, 35}, {0, 35, 280, -665, 0, 665, -280, -35}, {0, 21, 504,
43 315, -1680, 315, 504, 21}, {0, 7, 392, 1715,
44 0, -1715, -392, -7}, {0, 1, 120, 1191, 2416, 1191, 120, 1}};
48 float *coeffs =
new float[order*
order];
49 float *dcoeffs =
new float[order*
order];
51 static const float *scoeffs;
66 NAMD_die(
"unsupported PMEInterpOrder");
69 for (
int i=0, p=order-1; i<
order; ++i,--p ) {
70 for (
int j=0; j<
order; ++j ) {
71 double c = scoeffs[i*order+(order-1-j)];
74 coeffs[i*order+j] = c;
75 dcoeffs[i*order+j] = (double)p * c;
81 NAMD_die(
"cuda_init_bspline_coeffs static data checksum error");
82 cudaMalloc((
void**) c, order*order*
sizeof(
float));
83 cudaMalloc((
void**) dc, order*order*
sizeof(
float));
84 cudaMemcpy(*c, coeffs, order*order*
sizeof(
float), cudaMemcpyHostToDevice);
85 cudaMemcpy(*dc, dcoeffs, order*order*
sizeof(
float), cudaMemcpyHostToDevice);
90 #define BSPLINE_DEFS \
92 float a2d[order][order]; \
93 float a1d[order*order]; \
95 __shared__ volatile union { \
96 float a2d[atoms_per_warp][7]; \
97 float a1d[atoms_per_warp*7]; \
98 } atoms[warps_per_block]; \
99 if ( threadIdx.x < order*order ) { \
100 bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
// Index of this thread's warp within the block (assumes 32-thread warps:
// threadIdx.x / 32).
105 #define WARP (threadIdx.x>>5)
// Lane index of this thread within its warp (threadIdx.x % 32).
106 #define THREAD (threadIdx.x&31)
// Per-warp B-spline factor rows: [0]=x, [1]=y, [2]=z. The bspline_factors
// array itself is declared elsewhere (presumably in BSPLINE_DEFS) — one row
// of `order` floats per dimension per warp.
107 #define FX bspline_factors[threadIdx.x>>5][0]
108 #define FY bspline_factors[threadIdx.x>>5][1]
109 #define FZ bspline_factors[threadIdx.x>>5][2]
// Per-warp B-spline derivative factors, same layout as FX/FY/FZ above.
110 #define DFX bspline_dfactors[threadIdx.x>>5][0]
111 #define DFY bspline_dfactors[threadIdx.x>>5][1]
112 #define DFZ bspline_dfactors[threadIdx.x>>5][2]
// Combined value/derivative factors (used by the force kernel, where each
// element carries both f and df — see the fx2.x / fx2.y reads below).
114 #define FX2 bspline_2factors[threadIdx.x>>5][0]
115 #define FY2 bspline_2factors[threadIdx.x>>5][1]
116 #define FZ2 bspline_2factors[threadIdx.x>>5][2]
// Accessors into the per-warp shared-memory atom staging area declared in
// BSPLINE_DEFS: 7 floats per atom. ATOM is the whole 7-element record for
// atom i_atom of this warp; the individual fields are:
//   AX, AY, AZ — fractional grid coordinates (charge kernel range-checks
//                them against [0,1])
//   AQ         — charge
//   AI, AJ, AK — grid cell indices (stored as float, converted on use)
118 #define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
119 #define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
120 #define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
121 #define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
122 #define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
123 #define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
124 #define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
125 #define AK atoms[threadIdx.x>>5].a2d[i_atom][6]
131 const float * __restrict__ coeffs, \
132 float *
const * __restrict__ q_arr,
int * __restrict__ f_arr,
int * __restrict__ fz_arr, \
133 float ** __restrict__ a_data_ptr,
int* n_atoms_ptr, \
134 int* K1_ptr,
int* K2_ptr,
int* K3_ptr
137 int patchIndex = blockIdx.y;
138 int K1 = K1_ptr[patchIndex];
139 int K2 = K2_ptr[patchIndex];
140 int K3 = K3_ptr[patchIndex];
141 int n_atoms = n_atoms_ptr[patchIndex];
148 if ( aidx + atoms_per_warp > n_atoms ) {
149 atoms_this_warp = n_atoms - aidx;
150 if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
152 if (
THREAD < atoms_this_warp*7 ) {
153 float* a_data = a_data_ptr[patchIndex];
157 for (
int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
167 for (
int i=1; i<
order; ++i ) {
168 f = af * f + bspline_coeffs.a2d[i][
THREAD %
order];
176 int gti = ti + ai;
if ( gti >= K1 ) gti -= K1;
177 int gtj = tj + aj;
if ( gtj >= K2 ) gtj -= K2;
178 f_arr[gti * K2 + gtj] = 1;
181 int gtk = ak;
if ( gtk >= K3 ) gtk -= K3;
188 int ti = i/(order*
order);
189 int tj = (i/
order)%order;
191 float val = aq *
FX[ti] *
FY[tj] *
FZ[tk];
193 int gti = ti + ai;
if ( gti >= K1 ) gti -= K1;
194 int gtj = tj + aj;
if ( gtj >= K2 ) gtj -= K2;
195 int gtk = ak;
if ( gtk >= K3 ) gtk -= K3;
197 float *dest = q_arr[gti * K2 + gtj];
198 if ( dest ) atomicAdd(dest+gtk,val);
206 const float * __restrict__ coeffs, \
207 float *
const * __restrict__ q_arr,
int * __restrict__ f_arr,
int * __restrict__ fz_arr, \
208 float * __restrict__ a_data,
int n_atoms, \
209 int K1,
int K2,
int K3
216 if ( aidx + atoms_per_warp > n_atoms ) {
217 atoms_this_warp = n_atoms - aidx;
218 if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
220 if (
THREAD < atoms_this_warp*7 ) {
224 for (
int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
233 AQ < -30.f ||
AQ > 30.f ||
AX < 0.f ||
AX > 1.f ||
234 AY < 0.f ||
AY > 1.f ||
AZ < 0.f ||
AZ > 1.f )
237 printf(
"%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms,
AI,
AJ,
AK,
AQ,
AX,
AY,
AZ );
244 for (
int i=1; i<
order; ++i ) {
245 f = af * f + bspline_coeffs.a2d[i][
THREAD %
order];
253 int gti = ti + ai;
if ( gti >= K1 ) gti -= K1;
254 int gtj = tj + aj;
if ( gtj >= K2 ) gtj -= K2;
256 if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
258 printf(
"%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj,
AI,
AJ,
AK,
AQ,
AX,
AY,
AZ);
261 f_arr[gti * K2 + gtj] = 1;
264 int gtk = ak;
if ( gtk >= K3 ) gtk -= K3;
267 if ( gtk < 0 || gtk >= K3+order-1 ) {
269 printf(
"%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms,
THREAD, gtk,
AI,
AJ,
AK,
AQ,
AX,
AY,
AZ);
275 int ti = i/(order*
order);
276 int tj = (i/
order)%order;
278 float val = aq *
FX[ti] *
FY[tj] *
FZ[tk];
279 int gti = ti + ai;
if ( gti >= K1 ) gti -= K1;
280 int gtj = tj + aj;
if ( gtj >= K2 ) gtj -= K2;
281 int gtk = ak;
if ( gtk >= K3 ) gtk -= K3;
283 float *dest = q_arr[gti * K2 + gtj];
285 if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
287 printf(
"%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk,
AI,
AJ,
AK, dest,
AQ,
AX,
AY,
AZ);
290 if ( dest ) atomicAdd(dest+gtk,val);
297 const float * __restrict__ coeffs, \
298 float *
const * __restrict__ q_arr, \
299 float *
const * __restrict__ afn_g, \
302 int K1,
int K2,
int K3
304 __shared__
float *afn_s[3];
305 if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
307 __shared__
volatile union {
312 __shared__
volatile union {
318 const int n_atoms = afn_s[2] - afn_s[1];
320 if ( aidx + atoms_per_warp > n_atoms ) {
321 atoms_this_warp = n_atoms - aidx;
322 if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
324 if (
THREAD < atoms_this_warp*7 ) {
329 for (
int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
339 for (
int i=1; i<
order; ++i ) {
340 df = af * df - (order-i) * c;
346 __threadfence_block();
353 int ti = i/(order*
order);
354 int tj = (i/
order)%order;
361 const float fx = fx2.
x;
362 const float fy = fy2.
x;
363 const float fz = fz2.
x;
364 const float dfx = fx2.
y;
365 const float dfy = fy2.
y;
366 const float dfz = fz2.
y;
368 float dx = K1 * aq * dfx * fy * fz;
369 float dy = K2 * aq * fx * dfy * fz;
370 float dz = K3 * aq * fx * fy * dfz;
372 int gti = ti + ai;
if ( gti >= K1 ) gti -= K1;
373 int gtj = tj + aj;
if ( gtj >= K2 ) gtj -= K2;
374 int gtk = ak;
if ( gtk >= K3 ) gtk -= K3;
377 const float * __restrict__ src = q_arr[gti * K2 + gtj];
378 float pot = src ? src[gtk] : __int_as_float(0x7fffffff);
383 force_reduction[
WARP].a2d[
THREAD][0] = force_x;
384 force_reduction[
WARP].a2d[
THREAD][1] = force_y;
385 force_reduction[
WARP].a2d[
THREAD][2] = force_z;
405 if (
THREAD < atoms_this_warp*3 ) {
413 if ( ! nblocks )
return;
414 #define CALL(ORDER) if ( order == ORDER ) \
415 cuda_pme_charges_dev<ORDER><<<nblocks,WARPSIZE*warps_per_block,0,stream>>>( \
416 coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)
421 else NAMD_die(
"unsupported PMEInterpOrder");
429 gridSize.x = nblocksX;
432 #define CALL(ORDER) if ( order == ORDER ) \
433 cuda_pme_charges_batched_dev<ORDER><<<gridSize,WARPSIZE*warps_per_block,0,stream>>>( \
434 coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)
438 else NAMD_die(
"unsupported PMEInterpOrder");
// cudaFuncSetSharedMemConfig (8-byte shared-memory bank mode) was introduced
// in CUDA 4.2; on older toolkits replace the call with a no-op so the forces
// launch wrapper still compiles. NOTE(review): dummy() is presumably a no-op
// helper defined in the elided lines of this guard — confirm.
443 #if CUDA_VERSION < 4020
445 #define cudaFuncSetSharedMemConfig(X,Y) dummy()
451 if ( (! nblocks) || (! dimy) )
return;
453 #define CALL(ORDER) if ( order == ORDER ) \
454 cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
455 cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),WARPSIZE*warps_per_block,0,stream>>>( \
456 coeffs, q_arr, afn, K1, K2, K3)
461 else NAMD_die(
"unsupported PMEInterpOrder");
CUDA_PME_CHARGES_PROTOTYPE
static __thread atom * atoms
#define CUDA_PME_FORCES_PROTOTYPE
#define CUDA_PME_CHARGES_BATCHED_PROTOTYPE
__global__ void cuda_pme_charges_batched_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float **__restrict__ a_data_ptr, int *n_atoms_ptr, int *K1_ptr, int *K2_ptr, int *K3_ptr)
static const float order_8_coeffs[8][8]
__global__ void cuda_pme_forces_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, float *const *__restrict__ afn_g, int K1, int K2, int K3)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void NAMD_die(const char *err_msg)
static const float order_4_coeffs[4][4]
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
static const float order_6_coeffs[6][6]
__global__ void cuda_pme_charges_dev(const float *__restrict__ coeffs, float *const *__restrict__ q_arr, int *__restrict__ f_arr, int *__restrict__ fz_arr, float *__restrict__ a_data, int n_atoms, int K1, int K2, int K3)
#define CALL(DOENERGY, DOVIRIAL)
CUDA_PME_CHARGES_PROTOTYPE int nblocks