NAMD
ComputePmeCUDAKernel.cu
#ifdef NAMD_CUDA
#include <cuda.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
// allow linking, must prevent execution elsewhere
__device__ void atomicAdd(float *, float) { }
#endif
#endif //NAMD_CUDA
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif //NAMD_HIP

#include "HipDefines.h"
#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"

void NAMD_die(const char *);

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

// must run with at least order**2 threads
#ifdef NAMD_HIP
#define warps_per_block 4
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)
#else // NAMD_CUDA
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)
#endif


static const float order_4_coeffs[4][4] =
  {{1, -3,  3, -1},
   {0,  3, -6,  3},
   {0,  3,  0, -3},
   {0,  1,  4,  1}};
// divide by 3! = 6

static const float order_6_coeffs[6][6] =
  {{1, -5,  10, -10,   5,  -1},
   {0,  5, -20,  30, -20,   5},
   {0, 10, -20,   0,  20, -10},
   {0, 10,  20, -60,  20,  10},
   {0,  5,  50,   0, -50,  -5},
   {0,  1,  26,  66,  26,   1}};
// divide by 5! = 120

static const float order_8_coeffs[8][8] =
  {{1, -7,  21,  -35,    35,   -21,    7,  -1},
   {0,  7, -42,  105,  -140,   105,  -42,   7},
   {0, 21, -84,  105,     0,  -105,   84, -21},
   {0, 35,   0, -315,   560,  -315,    0,  35},
   {0, 35, 280, -665,     0,   665, -280, -35},
   {0, 21, 504,  315, -1680,   315,  504,  21},
   {0,  7, 392, 1715,     0, -1715, -392,  -7},
   {0,  1, 120, 1191,  2416,  1191,  120,   1}};
// divide by 7! = 5040
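
// Illustration (not part of NAMD): feeding the reversed, normalized order-4
// table that cuda_init_bspline_coeffs() builds below through the same Horner
// recurrence the kernels use reproduces the familiar cubic B-spline weights,
// e.g. column 0 evaluates to (1-u)^3/6, and the four weights sum to 1 for
// any u in [0,1). The function name and flat row-major layout are hypothetical.
#if 0
static void bspline4_weights(const float *coeffs /* 16 entries, row-major */,
                             float u, float w[4]) {
  for ( int j=0; j<4; ++j ) {
    float f = coeffs[0*4+j];                       // leading coefficient
    for ( int i=1; i<4; ++i ) f = u * f + coeffs[i*4+j];
    w[j] = f;                                      // w[0]+w[1]+w[2]+w[3] == 1
  }
}
#endif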

void cuda_init_bspline_coeffs(float **c, float **dc, int order) {
  float *coeffs = new float[order*order];
  float *dcoeffs = new float[order*order];
  double divisor;
  static const float *scoeffs;
  switch ( order ) {
  case 4:
    scoeffs = &order_4_coeffs[0][0];
    divisor = 6;
    break;
  case 6:
    scoeffs = &order_6_coeffs[0][0];
    divisor = 120;
    break;
  case 8:
    scoeffs = &order_8_coeffs[0][0];
    divisor = 5040;
    break;
  default:
    NAMD_die("unsupported PMEInterpOrder");
  }
  double sum = 0;
  for ( int i=0, p=order-1; i<order; ++i,--p ) {
    for ( int j=0; j<order; ++j ) {
      double c = scoeffs[i*order+(order-1-j)];  // reverse order
      sum += c;
      c /= divisor;
      coeffs[i*order+j] = c;
      dcoeffs[i*order+j] = (double)p * c;
      // printf("%d %d %f %f\n", i, j, c, (double)p*c);
    }
  }
  // printf("checksum: %f %f\n", sum, divisor);
  if ( sum != divisor )
    NAMD_die("cuda_init_bspline_coeffs static data checksum error");
  cudaMalloc((void**) c, order*order*sizeof(float));
  cudaMalloc((void**) dc, order*order*sizeof(float));
  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  delete [] coeffs;
  delete [] dcoeffs;
}
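
// Illustration (not part of NAMD): a minimal host-side use of the routine
// above for 4th-order interpolation. Freeing the tables with cudaFree() is
// an assumption here; in NAMD the pointers are owned by the PME compute code.
#if 0
void example_coeff_tables() {
  float *c = 0, *dc = 0;
  cuda_init_bspline_coeffs(&c, &dc, 4);  // device tables, 4*4 floats each
  // ... launch the charge / force kernels below with c (and dc) ...
  cudaFree(c);
  cudaFree(dc);
}
#endif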

#define BSPLINE_DEFS \
  __shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  BLOCK_SYNC;
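
// The macro above gives every kernel two shared-memory staging areas: one
// block-wide copy of the order*order coefficient table (loaded from the
// coeffs argument by the first order*order threads), and, per warp, room
// for atoms_per_warp records of 7 floats each, unpacked through the A*
// macros defined below.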

// simplify warp-synchronous operations
#ifdef NAMD_HIP

#define WARP (threadIdx.x>>6)
#define THREAD (threadIdx.x&63)
#define FX bspline_factors[threadIdx.x>>6][0]
#define FY bspline_factors[threadIdx.x>>6][1]
#define FZ bspline_factors[threadIdx.x>>6][2]
#define DFX bspline_dfactors[threadIdx.x>>6][0]
#define DFY bspline_dfactors[threadIdx.x>>6][1]
#define DFZ bspline_dfactors[threadIdx.x>>6][2]

#define FX2 bspline_2factors[threadIdx.x>>6][0]
#define FY2 bspline_2factors[threadIdx.x>>6][1]
#define FZ2 bspline_2factors[threadIdx.x>>6][2]

#define ATOM atoms[threadIdx.x>>6].a2d[i_atom]
#define AX atoms[threadIdx.x>>6].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>6].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>6].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>6].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>6].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>6].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>6].a2d[i_atom][6]

#else //NAMD_CUDA
#define WARP (threadIdx.x>>5)
#define THREAD (threadIdx.x&31)
#define FX bspline_factors[threadIdx.x>>5][0]
#define FY bspline_factors[threadIdx.x>>5][1]
#define FZ bspline_factors[threadIdx.x>>5][2]
#define DFX bspline_dfactors[threadIdx.x>>5][0]
#define DFY bspline_dfactors[threadIdx.x>>5][1]
#define DFZ bspline_dfactors[threadIdx.x>>5][2]

#define FX2 bspline_2factors[threadIdx.x>>5][0]
#define FY2 bspline_2factors[threadIdx.x>>5][1]
#define FZ2 bspline_2factors[threadIdx.x>>5][2]

#define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
#define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>5].a2d[i_atom][6]
#endif
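
// WARP and THREAD split threadIdx.x into a warp index and a lane index: the
// HIP build assumes 64-lane wavefronts (shift by 6, mask with 63), the CUDA
// build 32-lane warps (shift by 5, mask with 31). The remaining macros are
// shorthand for per-warp rows of the shared arrays used inside the kernels,
// with AX..AK naming the 7 floats stored per atom (x, y, z, charge, and the
// i, j, k grid offsets).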


template <int order>
__global__ void cuda_pme_charges_batched_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr,
  float ** __restrict__ a_data_ptr, int* n_atoms_ptr,
  int* K1_ptr, int* K2_ptr, int* K3_ptr
) {

  int patchIndex = blockIdx.y;
  int K1 = K1_ptr[patchIndex];
  int K2 = K2_ptr[patchIndex];
  int K3 = K3_ptr[patchIndex];
  int n_atoms = n_atoms_ptr[patchIndex];

  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      float* a_data = a_data_ptr[patchIndex];
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;

    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
#endif
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)

      fz_arr[gtk] = 1;
    }

    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];

      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}
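
// In the batched kernel above, blockIdx.y selects the patch (its own grid
// dimensions, atom count, and atom array) while blockIdx.x tiles that
// patch's atoms in groups of atoms_per_block; each warp then spreads its
// atoms_per_warp atoms onto the charge grid the same way as
// cuda_pme_charges_dev below.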


template <int order>
__global__ void cuda_pme_charges_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr,
  float * __restrict__ a_data, int n_atoms,
  int K1, int K2, int K3
) {
  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
#if 0
if ( THREAD == 0 && (
  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
  continue;
}
#endif
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
#if 0
if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
#if 0
if ( gtk < 0 || gtk >= K3+order-1 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
#if 0
if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
} else
#endif
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}
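
// Illustration (not part of NAMD): a serial sketch of what one warp does per
// atom in the two kernels above. wx/wy/wz stand for the order B-spline
// weights (FX/FY/FZ), and each q_arr[gti*K2+gtj] entry is treated as a
// z-column padded to K3+order-1 floats, as the debug checks above assume.
// The function name and argument layout here are hypothetical.
#if 0
static void spread_one_atom(float q, int ai, int aj, int ak,
                            const float *wx, const float *wy, const float *wz,
                            float * const *q_arr, int K1, int K2, int K3, int order) {
  for ( int ti=0; ti<order; ++ti ) {
    int gti = ti + ai; if ( gti >= K1 ) gti -= K1;    // periodic wrap in x
    for ( int tj=0; tj<order; ++tj ) {
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;  // periodic wrap in y
      float *dest = q_arr[gti * K2 + gtj];            // one padded z-column
      if ( ! dest ) continue;                         // column not allocated
      for ( int tk=0; tk<order; ++tk ) {
        int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
        gtk += tk;                                    // runs into the padding
        dest[gtk] += q * wx[ti] * wy[tj] * wz[tk];    // atomicAdd in the kernel
      }
    }
  }
}
#endif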

template <int order>
__global__ void cuda_pme_forces_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr,
  float * const * __restrict__ afn_g,
  /* float * __restrict__ a_data,
     float * __restrict__ f_data, int n_atoms, */
  int K1, int K2, int K3
) {
  __shared__ float *afn_s[3];
  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
  BSPLINE_DEFS
  __shared__ volatile union {
    float a2d[atoms_per_warp][3];
    float a1d[atoms_per_warp*3];
  } force_output[warps_per_block];
  __shared__ float2 bspline_2factors[warps_per_block][3][order];
  __shared__ volatile union {
    float a2d[WARPSIZE][3];
    float a1d[WARPSIZE*3];
  } force_reduction[warps_per_block];
  int atoms_this_warp = atoms_per_warp;
  {
    const int n_atoms = afn_s[2] - afn_s[1];
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
    }
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
    __syncthreads();
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
#endif
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float df = 0.f;
      float c = bspline_coeffs.a2d[0][THREAD % order];
      float f = c;
      for ( int i=1; i<order; ++i ) {
        df = af * df - (order-i) * c;
        c = bspline_coeffs.a2d[i][THREAD % order];
        f = af * f + c;
      }
      bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
    }
    __threadfence_block();  // bspline_2factors not declared volatile
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);
    __syncthreads();
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    float force_x = 0.f;
    float force_y = 0.f;
    float force_z = 0.f;

    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;

      const float2 fx2 = FX2[ti];
      const float2 fy2 = FY2[tj];
      const float2 fz2 = FZ2[tk];

      const float fx = fx2.x;
      const float fy = fy2.x;
      const float fz = fz2.x;
      const float dfx = fx2.y;
      const float dfy = fy2.y;
      const float dfz = fz2.y;

      float dx = K1 * aq * dfx * fy * fz;
      float dy = K2 * aq * fx * dfy * fz;
      float dz = K3 * aq * fx * fy * dfz;

      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;

      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
      float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
      force_x += dx * pot;
      force_y += dy * pot;
      force_z += dz * pot;
    }
    force_reduction[WARP].a2d[THREAD][0] = force_x;
    force_reduction[WARP].a2d[THREAD][1] = force_y;
    force_reduction[WARP].a2d[THREAD][2] = force_z;
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    __syncthreads();
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
#endif

    if ( THREAD < 24 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
                                         + force_reduction[WARP].a1d[THREAD + 48]
                                         + force_reduction[WARP].a1d[THREAD + 72]
#ifdef NAMD_HIP
                                         + force_reduction[WARP].a1d[THREAD + 96]
                                         + force_reduction[WARP].a1d[THREAD + 120]
                                         + force_reduction[WARP].a1d[THREAD + 144]
                                         + force_reduction[WARP].a1d[THREAD + 168]
#endif
                                         ;
    }
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);
    __syncthreads();
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    if ( THREAD < 6 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
                                         + force_reduction[WARP].a1d[THREAD + 12]
                                         + force_reduction[WARP].a1d[THREAD + 18];
    }
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);
    __syncthreads();
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    if ( THREAD < 3 ) {
      force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
                                             + force_reduction[WARP].a1d[THREAD + 3];
    }
  } // i_atom
#if defined(NAMD_HIP)
  // NAMD_WARP_SYNC(WARP_FULL_MASK);
  __syncthreads();
#else
  WARP_SYNC(WARP_FULL_MASK);
#endif
  if ( THREAD < atoms_this_warp*3 ) {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
  }
}
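
// Each iteration of the innermost loop above multiplies the grid value by
// K1*aq*dfx*fy*fz (and the analogous y and z expressions), where the Horner
// loop stored f and df per dimension in bspline_2factors, df accumulating
// the negative derivative of the same polynomial. The WARPSIZE*3 per-lane
// partial sums are then folded in shared memory, 96 floats (192 on HIP)
// -> 24 -> 6 -> 3, and the three components per atom are written back
// through the patch's force pointer afn_s[1].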

CUDA_PME_CHARGES_PROTOTYPE {
  int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( ! nblocks ) return;

#define CALL(ORDER) if ( order == ORDER ) \
  cuda_pme_charges_dev<ORDER><<<nblocks,WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

CUDA_PME_CHARGES_BATCHED_PROTOTYPE {
  int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocksX) || (! numPatches) ) return;
  dim3 gridSize;
  gridSize.x = nblocksX;
  gridSize.y = numPatches;
  gridSize.z = 1;
#define CALL(ORDER) if ( order == ORDER ) \
  cuda_pme_charges_batched_dev<ORDER><<<gridSize,WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}


#if CUDA_VERSION < 4020
void dummy() { }
#define cudaFuncSetSharedMemConfig(X,Y) dummy()
#endif


CUDA_PME_FORCES_PROTOTYPE {
  int nblocks = (maxn + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocks) || (! dimy) ) return;

#define CALL(ORDER) if ( order == ORDER ) \
  cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
  cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");

#undef CALL
}
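
// The *_PROTOTYPE entry points above (macros supplied by the included
// ComputePmeCUDAKernel.h) dispatch on the runtime interpolation order via
// the CALL macro, which expands into an if / else-if chain over the explicit
// template instantiations for orders 4, 6, and 8. Before launching the force
// kernel, cudaFuncSetSharedMemConfig() requests 8-byte shared memory banks
// to suit the float2 bspline_2factors entries; when CUDA_VERSION is older
// than 4.2 (or not defined at all) the call is stubbed out by the dummy()
// macro above.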
#endif // NAMD_CUDA || NAMD_HIP