// ComputePmeCUDAKernel.cu
#include "ComputePmeCUDAKernel.h"
#include "CudaUtils.h"

void NAMD_die(const char *);

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef NAMD_CUDA
#include <cuda.h>

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
// allow linking, must prevent execution elsewhere
__device__ void atomicAdd(float *, float) { }
#endif
#endif //NAMD_CUDA
#ifdef NAMD_HIP
#include "HipDefines.h"
#include <hip/hip_runtime.h>
#endif //NAMD_HIP
#if 0
#include <stdio.h>
#endif

// must run with at least order**2 threads
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)
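
// Launch geometry used throughout this file: every kernel below is launched
// with WARPSIZE*warps_per_block threads per block, each warp spreads or
// gathers atoms_per_warp atoms, so one block covers atoms_per_block atoms and
// the host launchers round the atom count up to a multiple of that.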

static const float order_4_coeffs[4][4] =
  {{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}};
// divide by 3! = 6

static const float order_6_coeffs[6][6] =
  {{1, -5, 10, -10, 5, -1},
   {0, 5, -20, 30, -20, 5},
   {0, 10, -20, 0, 20, -10},
   {0, 10, 20, -60, 20, 10},
   {0, 5, 50, 0, -50, -5},
   {0, 1, 26, 66, 26, 1}};
// divide by 5! = 120

static const float order_8_coeffs[8][8] =
  {{1, -7, 21, -35, 35, -21, 7, -1},
   {0, 7, -42, 105, -140, 105, -42, 7},
   {0, 21, -84, 105, 0, -105, 84, -21},
   {0, 35, 0, -315, 560, -315, 0, 35},
   {0, 35, 280, -665, 0, 665, -280, -35},
   {0, 21, 504, 315, -1680, 315, 504, 21},
   {0, 7, 392, 1715, 0, -1715, -392, -7},
   {0, 1, 120, 1191, 2416, 1191, 120, 1}};
// divide by 7! = 5040

void cuda_init_bspline_coeffs(float **c, float **dc, int order) {
  float *coeffs = new float[order*order];
  float *dcoeffs = new float[order*order];
  double divisor;
  static const float *scoeffs;
  switch ( order ) {
  case 4:
    scoeffs = &order_4_coeffs[0][0];
    divisor = 6;
    break;
  case 6:
    scoeffs = &order_6_coeffs[0][0];
    divisor = 120;
    break;
  case 8:
    scoeffs = &order_8_coeffs[0][0];
    divisor = 5040;
    break;
  default:
    NAMD_die("unsupported PMEInterpOrder");
  }
  double sum = 0;
  for ( int i=0, p=order-1; i<order; ++i,--p ) {
    for ( int j=0; j<order; ++j ) {
      double c = scoeffs[i*order+(order-1-j)];  // reverse order
      sum += c;
      c /= divisor;
      coeffs[i*order+j] = c;
      dcoeffs[i*order+j] = (double)p * c;
      // printf("%d %d %f %f\n", i, j, c, (double)p*c);
    }
  }
  // printf("checksum: %f %f\n", sum, divisor);
  if ( sum != divisor )
    NAMD_die("cuda_init_bspline_coeffs static data checksum error");
  cudaMalloc((void**) c, order*order*sizeof(float));
  cudaMalloc((void**) dc, order*order*sizeof(float));
  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  delete [] coeffs;
  delete [] dcoeffs;
}
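
#if 0
// Illustrative host-side sketch (compiled out, not part of the original build):
// the kernels below evaluate the per-dimension B-spline weights with Horner's
// rule over the reversed, divisor-scaled rows produced by
// cuda_init_bspline_coeffs. For any fractional offset u in [0,1) the `order`
// weights form a partition of unity, which this sketch checks for order 4.
static void check_bspline_partition_of_unity() {
  const int order = 4;
  float coeffs[order*order];
  for ( int i=0; i<order; ++i )
    for ( int j=0; j<order; ++j )
      coeffs[i*order+j] = order_4_coeffs[i][order-1-j] / 6.f;  // same reversal and scaling as above
  const float u = 0.37f;  // arbitrary fractional coordinate
  float sum = 0.f;
  for ( int j=0; j<order; ++j ) {
    float f = coeffs[0*order+j];
    for ( int i=1; i<order; ++i ) f = u * f + coeffs[i*order+j];  // Horner, as in the kernels
    sum += f;
  }
  // sum should equal 1 to within float rounding for any u in [0,1)
}
#endif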

#define BSPLINE_DEFS \
  __shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  BLOCK_SYNC;

// simplify warp-synchronous operations
#define WARP (threadIdx.x>>5)
#define THREAD (threadIdx.x&31)
#define FX bspline_factors[threadIdx.x>>5][0]
#define FY bspline_factors[threadIdx.x>>5][1]
#define FZ bspline_factors[threadIdx.x>>5][2]
#define DFX bspline_dfactors[threadIdx.x>>5][0]
#define DFY bspline_dfactors[threadIdx.x>>5][1]
#define DFZ bspline_dfactors[threadIdx.x>>5][2]

#define FX2 bspline_2factors[threadIdx.x>>5][0]
#define FY2 bspline_2factors[threadIdx.x>>5][1]
#define FZ2 bspline_2factors[threadIdx.x>>5][2]

#define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
#define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>5].a2d[i_atom][6]
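
// Atom records occupy 7 consecutive floats in the a_data / afn storage read by
// the kernels, addressed by the macros above: fractional grid offsets
// (AX, AY, AZ), the charge (AQ), and starting grid indices stored as floats
// (AI, AJ, AK).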

template <int order>
__global__ void cuda_pme_charges_batched_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float ** __restrict__ a_data_ptr, int* n_atoms_ptr, \
  int* K1_ptr, int* K2_ptr, int* K3_ptr
) {

  int patchIndex = blockIdx.y;
  int K1 = K1_ptr[patchIndex];
  int K2 = K2_ptr[patchIndex];
  int K3 = K3_ptr[patchIndex];
  int n_atoms = n_atoms_ptr[patchIndex];

  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      float* a_data = a_data_ptr[patchIndex];
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;

    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)

      fz_arr[gtk] = 1;
    }

    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];

      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}
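
// Unlike the single-patch kernel below, the batched variant above pulls its
// grid dimensions and atom-data pointer from per-patch arrays indexed by
// blockIdx.y, so a single launch spreads charges for numPatches patches.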

template <int order>
__global__ void cuda_pme_charges_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr, \
  float * __restrict__ a_data, int n_atoms, \
  int K1, int K2, int K3
) {
  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
#if 0
if ( THREAD == 0 && (
  AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
  AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
  AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
  continue;
}
#endif
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
#if 0
if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
#if 0
if ( gtk < 0 || gtk >= K3+order-1 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
} else
#endif
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
#if 0
if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
  int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
  printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, dest, AQ, AX, AY, AZ);
} else
#endif
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  } // i_atom
}

template <int order>
__global__ void cuda_pme_forces_dev(
  const float * __restrict__ coeffs, \
  float * const * __restrict__ q_arr, \
  float * const * __restrict__ afn_g, \
  /* float * __restrict__ a_data,
     float * __restrict__ f_data, int n_atoms, */
  int K1, int K2, int K3
) {
  __shared__ float *afn_s[3];
  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
  BSPLINE_DEFS
  __shared__ volatile union {
    float a2d[atoms_per_warp][3];
    float a1d[atoms_per_warp*3];
  } force_output[warps_per_block];
  __shared__ float2 bspline_2factors[warps_per_block][3][order];
  __shared__ volatile union {
    float a2d[WARPSIZE][3];
    float a1d[WARPSIZE*3];
  } force_reduction[warps_per_block];
  int atoms_this_warp = atoms_per_warp;
  {
    const int n_atoms = afn_s[2] - afn_s[1];
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    float aq=AQ;
    int ai=(int)AI;
    int aj=(int)AJ;
    int ak=(int)AK;
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float df = 0.f;
      float c = bspline_coeffs.a2d[0][THREAD % order];
      float f = c;
      for ( int i=1; i<order; ++i ) {
        df = af * df - (order-i) * c;
        c = bspline_coeffs.a2d[i][THREAD % order];
        f = af * f + c;
      }
      bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
    }
    __threadfence_block();  // bspline_2factors not declared volatile
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors (FX2,FY2,FZ2)
    float force_x = 0.f;
    float force_y = 0.f;
    float force_z = 0.f;

    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;

      const float2 fx2 = FX2[ti];
      const float2 fy2 = FY2[tj];
      const float2 fz2 = FZ2[tk];

      const float fx = fx2.x;
      const float fy = fy2.x;
      const float fz = fz2.x;
      const float dfx = fx2.y;
      const float dfy = fy2.y;
      const float dfz = fz2.y;

      float dx = K1 * aq * dfx * fy * fz;
      float dy = K2 * aq * fx * dfy * fz;
      float dz = K3 * aq * fx * fy * dfz;

      int gti = ti + ai; if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj; if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak; if ( gtk >= K3 ) gtk -= K3;

      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
      float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
      force_x += dx * pot;
      force_y += dy * pot;
      force_z += dz * pot;
    }
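    // Each lane of the warp now holds a partial (x,y,z) force for this atom;
    // the tree reduction below folds the WARPSIZE*3 partials (96 floats,
    // assuming a 32-thread warp) down 96 -> 24 -> 6 -> 3 into force_output.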
    force_reduction[WARP].a2d[THREAD][0] = force_x;
    force_reduction[WARP].a2d[THREAD][1] = force_y;
    force_reduction[WARP].a2d[THREAD][2] = force_z;
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 24 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
        + force_reduction[WARP].a1d[THREAD + 48]
        + force_reduction[WARP].a1d[THREAD + 72];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 6 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
        + force_reduction[WARP].a1d[THREAD + 12]
        + force_reduction[WARP].a1d[THREAD + 18];
    }
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
    if ( THREAD < 3 ) {
      force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
        + force_reduction[WARP].a1d[THREAD + 3];
    }
  } // i_atom
  WARP_SYNC(WARP_FULL_MASK);  // done writing force_output
  if ( THREAD < atoms_this_warp*3 ) {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
  }
}

CUDA_PME_CHARGES_PROTOTYPE {
  int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( ! nblocks ) return;
#define CALL(ORDER) if ( order == ORDER ) \
  cuda_pme_charges_dev<ORDER><<<nblocks,WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)

  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

CUDA_PME_CHARGES_BATCHED_PROTOTYPE {
  int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocksX) || (! numPatches) ) return;
  dim3 gridSize;
  gridSize.x = nblocksX;
  gridSize.y = numPatches;
  gridSize.z = 1;
#define CALL(ORDER) if ( order == ORDER ) \
  cuda_pme_charges_batched_dev<ORDER><<<gridSize,WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)
  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}


#if CUDA_VERSION < 4020
void dummy() { }
#define cudaFuncSetSharedMemConfig(X,Y) dummy()
#endif


CUDA_PME_FORCES_PROTOTYPE {
  int nblocks = (maxn + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocks) || (! dimy) ) return;

#define CALL(ORDER) if ( order == ORDER ) \
  cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
  cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),WARPSIZE*warps_per_block,0,stream>>>( \
    coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)

  CALL(4);
  else CALL(6);
  else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");
#undef CALL
}

#endif // NAMD_CUDA || NAMD_HIP