#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
// allow linking, must prevent execution elsewhere
__device__ void atomicAdd(float *, float) { }
#endif
#if defined(NAMD_HIP)
#include <hip/hip_runtime.h>
#include "HipDefines.h"
#endif
14 #include "ComputePmeCUDAKernel.h"
15 #include "CudaUtils.h"
17 void NAMD_die(const char *);
19 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
// must run with at least order**2 threads
#if defined(NAMD_HIP)
#define warps_per_block 4
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)
#else
#define warps_per_block 8
#define atoms_per_warp 4
#define atoms_per_block (atoms_per_warp * warps_per_block)
#endif
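
// With either setting a block runs WARPSIZE*warps_per_block = 256 threads and
// handles atoms_per_block = 16 atoms: one warp (or HIP wavefront) per group of
// atoms_per_warp atoms.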
static const float order_4_coeffs[4][4] =
 {{1, -3, 3, -1}, {0, 3, -6, 3}, {0, 3, 0, -3}, {0, 1, 4, 1}};
// divide by 3! = 6

static const float order_6_coeffs[6][6] =
 {{1, -5, 10, -10, 5, -1}, {0, 5, -20, 30, -20, 5},
  {0, 10, -20, 0, 20, -10}, {0, 10, 20, -60, 20, 10},
  {0, 5, 50, 0, -50, -5}, {0, 1, 26, 66, 26, 1}};
// divide by 5! = 120

static const float order_8_coeffs[8][8] =
 {{1, -7, 21, -35, 35, -21, 7, -1}, {0, 7, -42, 105, -140, 105, -42, 7},
  {0, 21, -84, 105, 0, -105, 84, -21}, {0, 35, 0, -315, 560, -315, 0, 35},
  {0, 35, 280, -665, 0, 665, -280, -35}, {0, 21, 504, 315, -1680, 315, 504, 21},
  {0, 7, 392, 1715, 0, -1715, -392, -7}, {0, 1, 120, 1191, 2416, 1191, 120, 1}};
// divide by 7! = 5040
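
// The tables above are the B-spline basis polynomials of each order in Horner
// form: row i holds the coefficients applied at the i-th Horner step and each
// column corresponds to one of the `order` grid points an atom touches.  They
// are stored un-normalized; cuda_init_bspline_coeffs() below divides by the
// factorial noted in the comments before uploading to the device.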
void cuda_init_bspline_coeffs(float **c, float **dc, int order) {
  float *coeffs = new float[order*order];
  float *dcoeffs = new float[order*order];
  double divisor;
  static const float *scoeffs;
  switch ( order ) {
  case 4:
    scoeffs = &order_4_coeffs[0][0];
    divisor = 6;  break;
  case 6:
    scoeffs = &order_6_coeffs[0][0];
    divisor = 120;  break;
  case 8:
    scoeffs = &order_8_coeffs[0][0];
    divisor = 5040;  break;
  default:
    NAMD_die("unsupported PMEInterpOrder");
  }
  double sum = 0.;
  for ( int i=0, p=order-1; i<order; ++i,--p ) {
    for ( int j=0; j<order; ++j ) {
      double c = scoeffs[i*order+(order-1-j)];  // reverse order
      sum += c;
      c /= divisor;
      coeffs[i*order+j] = c;
      dcoeffs[i*order+j] = (double)p * c;
      // printf("%d %d %f %f\n", i, j, c, (double)p*c);
    }
  }
  // printf("checksum: %f %f\n", sum, divisor);
  if ( sum != divisor ) NAMD_die("cuda_init_bspline_coeffs static data checksum error");
  cudaMalloc((void**) c, order*order*sizeof(float));
  cudaMalloc((void**) dc, order*order*sizeof(float));
  cudaMemcpy(*c, coeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(*dc, dcoeffs, order*order*sizeof(float), cudaMemcpyHostToDevice);
  delete [] coeffs;
  delete [] dcoeffs;
}
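
// Hypothetical usage sketch (not taken from NAMD itself): the caller keeps the
// returned device pointers and passes the first one as the `coeffs` argument of
// the spreading/gathering kernels below, e.g.
//
//   float *d_coeffs = 0, *d_dcoeffs = 0;
//   cuda_init_bspline_coeffs(&d_coeffs, &d_dcoeffs, 4);  // PME interpolation order 4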
#define BSPLINE_DEFS \
  __shared__ union { \
    float a2d[order][order]; \
    float a1d[order*order]; \
  } bspline_coeffs; \
  __shared__ volatile union { \
    float a2d[atoms_per_warp][7]; \
    float a1d[atoms_per_warp*7]; \
  } atoms[warps_per_block]; \
  if ( threadIdx.x < order*order ) { \
    bspline_coeffs.a1d[threadIdx.x] = coeffs[threadIdx.x]; \
  } \
  __syncthreads();  /* block barrier: every warp must see bspline_coeffs */

// simplify warp-synchronous operations
#if defined(NAMD_HIP)
// 64-lane wavefronts
#define WARP (threadIdx.x>>6)
#define THREAD (threadIdx.x&63)
#define FX bspline_factors[threadIdx.x>>6][0]
#define FY bspline_factors[threadIdx.x>>6][1]
#define FZ bspline_factors[threadIdx.x>>6][2]
#define DFX bspline_dfactors[threadIdx.x>>6][0]
#define DFY bspline_dfactors[threadIdx.x>>6][1]
#define DFZ bspline_dfactors[threadIdx.x>>6][2]

#define FX2 bspline_2factors[threadIdx.x>>6][0]
#define FY2 bspline_2factors[threadIdx.x>>6][1]
#define FZ2 bspline_2factors[threadIdx.x>>6][2]

#define ATOM atoms[threadIdx.x>>6].a2d[i_atom]
#define AX atoms[threadIdx.x>>6].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>6].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>6].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>6].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>6].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>6].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>6].a2d[i_atom][6]

#else
// 32-lane warps
#define WARP (threadIdx.x>>5)
#define THREAD (threadIdx.x&31)
#define FX bspline_factors[threadIdx.x>>5][0]
#define FY bspline_factors[threadIdx.x>>5][1]
#define FZ bspline_factors[threadIdx.x>>5][2]
#define DFX bspline_dfactors[threadIdx.x>>5][0]
#define DFY bspline_dfactors[threadIdx.x>>5][1]
#define DFZ bspline_dfactors[threadIdx.x>>5][2]

#define FX2 bspline_2factors[threadIdx.x>>5][0]
#define FY2 bspline_2factors[threadIdx.x>>5][1]
#define FZ2 bspline_2factors[threadIdx.x>>5][2]

#define ATOM atoms[threadIdx.x>>5].a2d[i_atom]
#define AX atoms[threadIdx.x>>5].a2d[i_atom][0]
#define AY atoms[threadIdx.x>>5].a2d[i_atom][1]
#define AZ atoms[threadIdx.x>>5].a2d[i_atom][2]
#define AQ atoms[threadIdx.x>>5].a2d[i_atom][3]
#define AI atoms[threadIdx.x>>5].a2d[i_atom][4]
#define AJ atoms[threadIdx.x>>5].a2d[i_atom][5]
#define AK atoms[threadIdx.x>>5].a2d[i_atom][6]

#endif
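
// cuda_pme_charges_batched_dev spreads atomic charges onto the PME grids for a
// batch of patches: blockIdx.y selects the patch, each warp loads atoms_per_warp
// atoms (7 floats per atom: x, y, z, q, i, j, k), evaluates the `order` B-spline
// factors per dimension, and accumulates q*Fx*Fy*Fz into q_arr with atomicAdd,
// wrapping grid indices into [0,K).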
template <int order>
__global__ void cuda_pme_charges_batched_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr,
  float ** __restrict__ a_data_ptr, int* n_atoms_ptr,
  int* K1_ptr, int* K2_ptr, int* K3_ptr
) {

  int patchIndex = blockIdx.y;
  int K1 = K1_ptr[patchIndex];
  int K2 = K2_ptr[patchIndex];
  int K3 = K3_ptr[patchIndex];
  int n_atoms = n_atoms_ptr[patchIndex];

  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      float* a_data = a_data_ptr[patchIndex];
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
#endif
    float aq = AQ;
    int ai = (int)AI;
    int aj = (int)AJ;
    int ak = (int)AK;
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
#endif
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  }  // i_atom
}
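
// cuda_pme_charges_dev is the single-grid variant of the kernel above; it also
// prints a diagnostic line whenever an atom's coordinates, charge, or grid
// indices fall outside the expected ranges before touching the grid.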
template <int order>
__global__ void cuda_pme_charges_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr, int * __restrict__ f_arr, int * __restrict__ fz_arr,
  float * __restrict__ a_data, int n_atoms,
  int K1, int K2, int K3
) {
  BSPLINE_DEFS
  __shared__ volatile float bspline_factors[warps_per_block][3][order];
  int atoms_this_warp = atoms_per_warp;
  {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = a_data[aidx*7+THREAD];
    }
  }
  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing atoms, reading bspline_factors (FX,FY,FZ)
#endif
    float aq = AQ;
    int ai = (int)AI;
    int aj = (int)AJ;
    int ak = (int)AK;
    if ( THREAD == 0 && (
      AI != (int)AI || AJ != (int)AJ || AK != (int)AK ||
      AQ < -30.f || AQ > 30.f || AX < 0.f || AX > 1.f ||
      AY < 0.f || AY > 1.f || AZ < 0.f || AZ > 1.f )
    ) {
      int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
      printf("%d/%d %f %f %f %f %f %f %f\n", aidx, n_atoms, AI, AJ, AK, AQ, AX, AY, AZ );
    }
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float f = bspline_coeffs.a2d[0][THREAD % order];
      for ( int i=1; i<order; ++i ) {
        f = af * f + bspline_coeffs.a2d[i][THREAD % order];
      }
      bspline_factors[WARP][THREAD/order][THREAD % order] = f;
    }
#if defined(NAMD_HIP)
    NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_factors
#endif
    for ( int i=THREAD; i < order*order; i+=WARPSIZE ) {
      int ti = i/order;
      int tj = i%order;
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      if ( gti < 0 || gtj < 0 || gti >= K1 || gtj >= K2 ) {
        int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
        printf("%d/%d %d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, AI, AJ, AK, AQ, AX, AY, AZ);
      }
      f_arr[gti * K2 + gtj] = 1;
    }
    if ( THREAD < order ) {
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += THREAD;  // padded to increase coalescing (maybe, but reduces re-use)
      if ( gtk < 0 || gtk >= K3+order-1 ) {
        int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
        printf("%d/%d %d %d %f %f %f %f %f %f %f\n", aidx, n_atoms, THREAD, gtk, AI, AJ, AK, AQ, AX, AY, AZ);
      }
      fz_arr[gtk] = 1;
    }
    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;
      float val = aq * FX[ti] * FY[tj] * FZ[tk];
      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)
      float *dest = q_arr[gti * K2 + gtj];  // could load as constant
      if ( gti < 0 || gtj < 0 || gtk < 0 || gti >= K1 || gtj >= K2 || gtk >= K3+order-1 || dest == 0 ) {
        int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp + i_atom;
        printf("%d/%d %d %d %d %d %f %f %f %ld %f %f %f %f\n", aidx, n_atoms, i, gti, gtj, gtk, AI, AJ, AK, (long)dest, AQ, AX, AY, AZ);
      }
      if ( dest ) atomicAdd(dest+gtk,val);
    }
  }  // i_atom
}
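
// cuda_pme_forces_dev gathers per-atom forces from the potential grids.
// afn_g packs three pointers per patch (atom data, force output, and an end
// pointer from which the atom count is derived); each thread accumulates
// q * dF * potential contributions over its share of the order^3 stencil, and
// the staged reductions below combine the per-lane partial sums into one
// (x,y,z) force per atom.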
template <int order>
__global__ void cuda_pme_forces_dev(
  const float * __restrict__ coeffs,
  float * const * __restrict__ q_arr,
  float * const * __restrict__ afn_g,
  /* float * __restrict__ a_data,
     float * __restrict__ f_data, int n_atoms, */
  int K1, int K2, int K3
) {
  __shared__ float *afn_s[3];
  if ( threadIdx.x < 3 ) afn_s[threadIdx.x] = afn_g[3*blockIdx.y+threadIdx.x];
  BSPLINE_DEFS
  __shared__ volatile union {
    float a2d[atoms_per_warp][3];
    float a1d[atoms_per_warp*3];
  } force_output[warps_per_block];
  __shared__ float2 bspline_2factors[warps_per_block][3][order];
  __shared__ volatile union {
    float a2d[WARPSIZE][3];
    float a1d[WARPSIZE*3];
  } force_reduction[warps_per_block];
  int atoms_this_warp = atoms_per_warp;
  {
    const int n_atoms = afn_s[2] - afn_s[1];
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    if ( aidx + atoms_per_warp > n_atoms ) {
      atoms_this_warp = n_atoms - aidx;  // may be negative
      if ( atoms_this_warp < 0 ) atoms_this_warp = 0;
    }
    if ( THREAD < atoms_this_warp*7 ) {
      atoms[WARP].a1d[THREAD] = afn_s[0][aidx*7+THREAD];
    }
  }
#if defined(NAMD_HIP)
  // NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
#else
  WARP_SYNC(WARP_FULL_MASK);  // done writing atoms
#endif

  for ( int i_atom=0; i_atom < atoms_this_warp; ++i_atom ) {
    // evaluate the B-spline factor and its derivative for each dimension
    if ( THREAD < 3 * order ) {
      const float af = ATOM[THREAD/order];  // x, y, or z
      float df = 0.f;
      float c = bspline_coeffs.a2d[0][THREAD % order];
      float f = c;
      for ( int i=1; i<order; ++i ) {
        df = af * df - (order-i) * c;
        c = bspline_coeffs.a2d[i][THREAD % order];
        f = af * f + c;
      }
      bspline_2factors[WARP][THREAD/order][THREAD % order] = make_float2(f,df);
    }
    __threadfence_block();  // bspline_2factors not declared volatile
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing bspline_2factors
#endif
    float aq = AQ;
    int ai = (int)AI;
    int aj = (int)AJ;
    int ak = (int)AK;
    float force_x = 0.f;
    float force_y = 0.f;
    float force_z = 0.f;

    for ( int i=THREAD; i < order*order*order; i+=WARPSIZE ) {
      int ti = i/(order*order);
      int tj = (i/order)%order;
      int tk = i%order;

      const float2 fx2 = FX2[ti];
      const float2 fy2 = FY2[tj];
      const float2 fz2 = FZ2[tk];

      const float fx = fx2.x;
      const float fy = fy2.x;
      const float fz = fz2.x;
      const float dfx = fx2.y;
      const float dfy = fy2.y;
      const float dfz = fz2.y;

      float dx = K1 * aq * dfx * fy * fz;
      float dy = K2 * aq * fx * dfy * fz;
      float dz = K3 * aq * fx * fy * dfz;

      int gti = ti + ai;  if ( gti >= K1 ) gti -= K1;
      int gtj = tj + aj;  if ( gtj >= K2 ) gtj -= K2;
      int gtk = ak;  if ( gtk >= K3 ) gtk -= K3;
      gtk += tk;  // padded to increase coalescing (maybe, but reduces re-use)

      const float * __restrict__ src = q_arr[gti * K2 + gtj];  // could load as constant
      float pot = src ? src[gtk] : __int_as_float(0x7fffffff);  // CUDA NaN
      force_x += dx * pot;
      force_y += dy * pot;
      force_z += dz * pot;
    }

    force_reduction[WARP].a2d[THREAD][0] = force_x;
    force_reduction[WARP].a2d[THREAD][1] = force_y;
    force_reduction[WARP].a2d[THREAD][2] = force_z;
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
#else
    WARP_SYNC(WARP_FULL_MASK);  // done writing force_reduction
#endif
    if ( THREAD < 24 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 24]
                                         + force_reduction[WARP].a1d[THREAD + 48]
                                         + force_reduction[WARP].a1d[THREAD + 72]
#if defined(NAMD_HIP)
                                         + force_reduction[WARP].a1d[THREAD + 96]
                                         + force_reduction[WARP].a1d[THREAD + 120]
                                         + force_reduction[WARP].a1d[THREAD + 144]
                                         + force_reduction[WARP].a1d[THREAD + 168]
#endif
                                         ;
    }
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    if ( THREAD < 6 ) {
      force_reduction[WARP].a1d[THREAD] += force_reduction[WARP].a1d[THREAD + 6]
                                         + force_reduction[WARP].a1d[THREAD + 12]
                                         + force_reduction[WARP].a1d[THREAD + 18];
    }
#if defined(NAMD_HIP)
    // NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
    WARP_SYNC(WARP_FULL_MASK);
#endif
    if ( THREAD < 3 ) {
      force_output[WARP].a2d[i_atom][THREAD] = force_reduction[WARP].a1d[THREAD]
                                             + force_reduction[WARP].a1d[THREAD + 3];
    }
  }  // i_atom

#if defined(NAMD_HIP)
  // NAMD_WARP_SYNC(WARP_FULL_MASK);
#else
  WARP_SYNC(WARP_FULL_MASK);
#endif
  if ( THREAD < atoms_this_warp*3 ) {
    int aidx = blockIdx.x * atoms_per_block + WARP * atoms_per_warp;
    afn_s[1][aidx*3+THREAD] = force_output[WARP].a1d[THREAD];
  }
}
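
// Host-side launch wrappers follow.  The CUDA_PME_*_PROTOTYPE macros (declared
// in ComputePmeCUDAKernel.h) supply the argument lists; each wrapper simply
// instantiates and launches the kernel matching the runtime interpolation order.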
CUDA_PME_CHARGES_PROTOTYPE {
  int nblocks = (n_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( ! nblocks ) return;

#define CALL(ORDER) if ( order == ORDER ) \
    cuda_pme_charges_dev<ORDER><<<nblocks,WARPSIZE*warps_per_block,0,stream>>>( \
      coeffs, q_arr, f_arr, fz_arr, a_data, n_atoms, K1, K2, K3)

  CALL(4); else CALL(6); else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");

#undef CALL
}
CUDA_PME_CHARGES_BATCHED_PROTOTYPE {
  int nblocksX = (n_max_atoms + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocksX) || (! numPatches) ) return;

  dim3 gridSize;
  gridSize.x = nblocksX;
  gridSize.y = numPatches;
  gridSize.z = 1;

#define CALL(ORDER) if ( order == ORDER ) \
    cuda_pme_charges_batched_dev<ORDER><<<gridSize,WARPSIZE*warps_per_block,0,stream>>>( \
      coeffs, q_arr, f_arr, fz_arr, a_data_ptr, n_atoms_ptr, K1_ptr, K2_ptr, K3_ptr)

  CALL(4); else CALL(6); else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");

#undef CALL
}
#if CUDA_VERSION < 4020
void dummy() { }
#define cudaFuncSetSharedMemConfig(X,Y) dummy()
#endif
CUDA_PME_FORCES_PROTOTYPE {
  int nblocks = (maxn + atoms_per_block - 1) / atoms_per_block;
  if ( (! nblocks) || (! dimy) ) return;

#define CALL(ORDER) if ( order == ORDER ) \
    cudaFuncSetSharedMemConfig(cuda_pme_forces_dev<ORDER>,cudaSharedMemBankSizeEightByte), \
    cuda_pme_forces_dev<ORDER><<<dim3(nblocks,dimy),WARPSIZE*warps_per_block,0,stream>>>( \
      coeffs, q_arr, afn, /* a_data, f_data, n_atoms, */ K1, K2, K3)

  CALL(4); else CALL(6); else CALL(8);
  else NAMD_die("unsupported PMEInterpOrder");

#undef CALL
}

#endif  // NAMD_CUDA || NAMD_HIP