2 #define _USE_MATH_DEFINES
9 #if __CUDACC_VER_MAJOR__ >= 11
10 #include <cub/cub.cuh>
12 #include <namd_cub/cub.cuh>
17 #include <hip/hip_runtime.h>
18 #include <hip/hip_runtime_api.h>
19 #include <hipcub/hipcub.hpp>
23 #include "HipDefines.h"
24 #include "CudaUtils.h"
25 #include "CudaRecord.h"
26 #include "CudaPmeSolverUtilKernel.h"
28 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
29 // CCELEC is 1/ (4 pi eps ) in AKMA units, conversion from SI
30 // units: CCELEC = e*e*Na / (4*pi*eps*1Kcal*1A)
32 // parameter :: CCELEC=332.0636D0 ! old value of dubious origin
33 // parameter :: CCELEC=331.843D0 ! value from 1986-1987 CRC Handbook
34 // ! of Chemistry and Physics
35 // real(chm_real), parameter :: &
36 // CCELEC_amber = 332.0522173D0, &
37 // CCELEC_charmm = 332.0716D0 , &
38 // CCELEC_discover = 332.054D0 , &
39 // CCELEC_namd = 332.0636D0
40 //const double ccelec = 332.0636;
41 //const double half_ccelec = 0.5*ccelec;
42 //const float ccelec_float = 332.0636f;
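// For reference, a worked conversion consistent with the values above (not from the original
// source; CODATA constants assumed): with e = 1.602176634e-19 C, Na = 6.02214076e23 1/mol,
// eps0 = 8.8541878128e-12 F/m, 1 kcal = 4184 J and 1 A = 1e-10 m,
//   e*e/(4*pi*eps0)       ~ 2.307e-28 J*m
//   * Na                  ~ 1.389e-4  J*m/mol
//   / (4184 J * 1e-10 m)  ~ 332.06    kcal*A/(mol*e^2)
// which reproduces CCELEC_namd = 332.0636 to the quoted precision.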
45 // Structure into which virials are stored
46 // NOTE: This structure is only used for computing addresses
48 double sforce_dp[27][3];
49 long long int sforce_fp[27][3];
51 // Energies start here ...
55 // Local structure used by the scalar_sum kernel for energy and virial reductions
56 struct RecipVirial_t {
62 __forceinline__ __device__ double expfunc(const double x) {
66 __forceinline__ __device__ float expfunc(const float x) {
71 // Performs scalar sum on data(nfft1, nfft2, nfft3)
72 // T = float or double
73 // T2 = float2 or double2
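// For reference (an explanatory note, not part of the original header): the quantity
// accumulated below is the smooth-PME reciprocal-space sum of Essmann et al. (1995),
//   E_recip = 1/(2*pi*V) * sum_{m != 0} exp(-pi^2 |m|^2 / kappa^2) / |m|^2 * B(m) * |S(m)|^2
// where S(m) is the FFT of the spread charge grid (the complex values in data),
// B(m) = |b1(m1)|^2 |b2(m2)|^2 |b3(m3)|^2 is supplied through prefac1/2/3,
// pi_ewald = pi^2/kappa^2 and piv_inv = 1/(pi*V). The same per-mode factor is also applied
// to data in place (the convolution step), and the virial follows from the d/dm terms.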
75 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
76 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
77 const int size1, const int size2, const int size3,
78 const T recip1x, const T recip1y, const T recip1z,
79 const T recip2x, const T recip2y, const T recip2z,
80 const T recip3x, const T recip3y, const T recip3z,
81 const T* prefac1, const T* prefac2, const T* prefac3,
82 const T pi_ewald, const T piv_inv,
83 const int k2_00, const int k3_00,
85 double* __restrict__ energy_recip,
86 double* __restrict__ virial) {
87 // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
89 extern __shared__ T sh_prefac[];
91 HIP_DYNAMIC_SHARED( T, sh_prefac)
94 // Create pointers to shared memory
95 T* sh_prefac1 = (T *)&sh_prefac[0];
96 T* sh_prefac2 = (T *)&sh_prefac[nfft1];
97 T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
99 // Calculate start position (k1, k2, k3) for each thread
100 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
101 int k3 = tid/(size1*size2);
102 tid -= k3*size1*size2;
104 int k1 = tid - k2*size1;
106 // Starting position in data
107 int pos = k1 + (k2 + k3*size2)*size1;
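// Worked example of the decomposition above (illustrative sizes only): with size1 = 33,
// size2 = 64 and tid = 5000, k3 = 5000/(33*64) = 2, the remainder is 776, k2 = 776/33 = 23
// and k1 = 776 - 23*33 = 17; pos = 17 + (23 + 2*64)*33 = 5000 recovers the linear index.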
109 // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
113 // Calculate limits w.r.t. global coordinates
114 const int lim2 = size2 + k2_00;
115 const int lim3 = size3 + k3_00;
117 // Calculate increments (k1_inc, k2_inc, k3_inc)
118 int tot_inc = blockDim.x*gridDim.x;
119 const int k3_inc = tot_inc/(size1*size2);
120 tot_inc -= k3_inc*size1*size2;
121 const int k2_inc = tot_inc/size1;
122 const int k1_inc = tot_inc - k2_inc*size1;
124 // Set data[0] = 0 for the global (0,0,0)
125 if (k1 == 0 && k2 == 0 && k3 == 0) {
130 // Increment position
144 pos += k3_inc*size1*size2;
147 // Load prefac data into shared memory
151 sh_prefac1[t] = prefac1[t];
156 sh_prefac2[t] = prefac2[t];
161 sh_prefac3[t] = prefac3[t];
168 double virial0 = 0.0;
169 double virial1 = 0.0;
170 double virial2 = 0.0;
171 double virial3 = 0.0;
172 double virial4 = 0.0;
173 double virial5 = 0.0;
175 // If nfft1 is odd, set nfft1_half to an impossible value so that
176 // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
177 // is never satisfied
178 const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
179 const int nfft2_half = nfft2/2;
180 const int nfft3_half = nfft3/2;
186 int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
187 int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
195 m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
196 m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
197 m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
200 T msq = m1*m1 + m2*m2 + m3*m3;
201 T msq_inv = ((T)1)/msq;
203 T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
204 T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
205 if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
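    // Note: the factor 2 in q2 and the 0.5 here come from the real-to-complex FFT along the
    // first dimension. Only k1 = 0..nfft1/2 is stored, so each stored mode normally represents
    // itself and its conjugate partner (hence the 2), except the k1 == 0 and k1 == nfft1/2
    // planes, which are their own conjugates and must not be double counted.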
206 T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
210 if (calc_energy_virial) {
212 T vir = ((T)2)*(pi_ewald + msq_inv);
214 virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
215 virial1 += (double)(fac*vir*m1*m2);
216 virial2 += (double)(fac*vir*m1*m3);
217 virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
218 virial4 += (double)(fac*vir*m2*m3);
219 virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
226 // Increment position
240 pos += k3_inc*size2*size1;
243 // Reduce energy and virial
244 if (calc_energy_virial) {
245 const int tid = threadIdx.x & (warpSize-1);
246 const int base = (threadIdx.x/warpSize);
247 volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
248 // Reduce within warps
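    // Note: WARP_SHUFFLE moves 32-bit registers, so each double is split into its high/low
    // halves with __double2hiint/__double2loint, shuffled, and reassembled with
    // __hiloint2double. On CUDA 9+ __shfl_down_sync accepts double directly, so a minimal
    // equivalent sketch (not the code used here) would be
    //   for (int d = warpSize/2; d >= 1; d /= 2)
    //     energy += __shfl_down_sync(WARP_FULL_MASK, energy, d);
    // the explicit hi/lo form is presumably kept for portability with the HIP WARP_SHUFFLE macro.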
249 for (int d=warpSize/2;d >= 1;d /= 2) {
250 energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
251 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
252 virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
253 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
254 virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
255 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
256 virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
257 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
258 virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
259 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
260 virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
261 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
262 virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
263 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
265 // Reduce between warps
266 // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
269 sh_ev[base].energy = energy;
270 sh_ev[base].virial[0] = virial0;
271 sh_ev[base].virial[1] = virial1;
272 sh_ev[base].virial[2] = virial2;
273 sh_ev[base].virial[3] = virial3;
274 sh_ev[base].virial[4] = virial4;
275 sh_ev[base].virial[5] = virial5;
279 energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
280 virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
281 virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
282 virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
283 virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
284 virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
285 virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
286 for (int d=warpSize/2;d >= 1;d /= 2) {
287 energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
288 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
289 virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
290 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
291 virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
292 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
293 virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
294 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
295 virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
296 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
297 virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
298 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
299 virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
300 WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
304 if (threadIdx.x == 0) {
305 atomicAdd(energy_recip, energy*0.5);
312 atomicAdd(&virial[0], virial0);
313 atomicAdd(&virial[1], virial1);
314 atomicAdd(&virial[2], virial2);
315 atomicAdd(&virial[3], virial3);
316 atomicAdd(&virial[4], virial4);
317 atomicAdd(&virial[5], virial5);
324 template <typename T>
325 __forceinline__ __device__ void write_grid(const float val, const int ind,
327 atomicAdd(&data[ind], (T)val);
331 // General version for any order
333 template <typename T, int order>
334 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
336 theta[order-1] = ((T)0);
338 theta[0] = ((T)1) - w;
341 for (int k=3;k <= order-1;k++) {
342 T div = ((T)1) / (T)(k-1);
343 theta[k-1] = div*w*theta[k-2];
345 for (int j=1;j <= k-2;j++) {
346 theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
348 theta[0] = div*(((T)1) - w)*theta[0];
351 //--- one more recursion
352 T div = ((T)1) / (T)(order-1);
353 theta[order-1] = div*w*theta[order-2];
355 for (int j=1;j <= order-2;j++) {
356 theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
359 theta[0] = div*(((T)1) - w)*theta[0];
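// Sanity check of the recursion above for order = 4 (cubic B-spline): the loop produces
//   theta[0] = (1-w)^3/6
//   theta[1] = (3w^3 - 6w^2 + 4)/6
//   theta[2] = (-3w^3 + 3w^2 + 3w + 1)/6
//   theta[3] = w^3/6
// which are the standard cardinal B-spline weights and sum to 1 for any w in [0,1).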
363 // Calculate theta and dtheta for general order bspline
365 template <typename T, typename T3, int order>
366 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
368 theta[order-1].x = ((T)0);
369 theta[order-1].y = ((T)0);
370 theta[order-1].z = ((T)0);
374 theta[0].x = ((T)1) - wx;
375 theta[0].y = ((T)1) - wy;
376 theta[0].z = ((T)1) - wz;
379 for (int k=3;k <= order-1;k++) {
380 T div = ((T)1) / (T)(k-1);
381 theta[k-1].x = div*wx*theta[k-2].x;
382 theta[k-1].y = div*wy*theta[k-2].y;
383 theta[k-1].z = div*wz*theta[k-2].z;
385 for (int j=1;j <= k-2;j++) {
386 theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
387 theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
388 theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
390 theta[0].x = div*(((T)1) - wx)*theta[0].x;
391 theta[0].y = div*(((T)1) - wy)*theta[0].y;
392 theta[0].z = div*(((T)1) - wz)*theta[0].z;
395 //--- perform standard b-spline differentiation
396 dtheta[0].x = -theta[0].x;
397 dtheta[0].y = -theta[0].y;
398 dtheta[0].z = -theta[0].z;
400 for (int j=2;j <= order;j++) {
401 dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
402 dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
403 dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
406 //--- one more recursion
407 T div = ((T)1) / (T)(order-1);
408 theta[order-1].x = div*wx*theta[order-2].x;
409 theta[order-1].y = div*wy*theta[order-2].y;
410 theta[order-1].z = div*wz*theta[order-2].z;
412 for (int j=1;j <= order-2;j++) {
413 theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
414 theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
415 theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
418 theta[0].x = div*(((T)1) - wx)*theta[0].x;
419 theta[0].y = div*(((T)1) - wy)*theta[0].y;
420 theta[0].z = div*(((T)1) - wz)*theta[0].z;
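// Note on the ordering above: the forward differences dtheta[j] = theta[j-1] - theta[j] are
// taken while theta[] still holds the order-1 lower splines, which matches the B-spline
// derivative identity dM_n(w)/dw = M_{n-1}(w) - M_{n-1}(w-1); only afterwards is theta[]
// lifted to the full order by the final recursion, so dtheta is the exact derivative of the
// returned theta.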
424 // Spreads the charge on the grid. Calculates theta on the fly
425 // blockDim.x = Number of atoms each block loads
426 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
428 template <typename AT, int order, bool periodicY, bool periodicZ>
430 spread_charge_kernel(const float4 *xyzq, const int ncoord,
431 const int nfftx, const int nffty, const int nfftz,
432 const int xsize, const int ysize, const int zsize,
433 const int xdim, const int y00, const int z00,
436 // Shared memory use:
437 // order = 4: 1920 bytes
438 // order = 6: 2688 bytes
439 // order = 8: 3456 bytes
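  // How those byte counts arise: 32 atoms * (3 int indices + 3*order float theta values)
  // = 32*(12 + 12*order) bytes, i.e. 1920 (order 4), 2688 (order 6), 3456 (order 8).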
440 __shared__ int sh_ix[32];
441 __shared__ int sh_iy[32];
442 __shared__ int sh_iz[32];
443 __shared__ float sh_thetax[order*32];
444 __shared__ float sh_thetay[order*32];
445 __shared__ float sh_thetaz[order*32];
447 // Process atoms pos to pos_end-1 (blockDim.x)
448 const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
449 const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
451 if (pos < pos_end && threadIdx.y == 0) {
453 float4 xyzqi = xyzq[pos];
459 float frx = ((float)nfftx)*x;
460 float fry = ((float)nffty)*y;
461 float frz = ((float)nfftz)*z;
467 float wx = frx - (float)frxi;
468 float wy = fry - (float)fryi;
469 float wz = frz - (float)frzi;
471 if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
472 if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
474 sh_ix[threadIdx.x] = frxi;
475 sh_iy[threadIdx.x] = fryi - y00;
476 sh_iz[threadIdx.x] = frzi - z00;
480 calc_one_theta<float, order>(wx, theta);
482 for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
484 calc_one_theta<float, order>(wy, theta);
486 for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
488 calc_one_theta<float, order>(wz, theta);
490 for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
496 // Grid point location, values of (x0, y0, z0) are in range 0..order-1
497 // NOTE: Only threads tid=0...order*order*order-1 do any computation
498 const int order3 = ((order*order*order-1)/32 + 1)*32;
499 const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
500 const int x0 = tid % order;
501 const int y0 = (tid / order) % order;
502 const int z0 = tid / (order*order);
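  // Worked example: order3 rounds order^3 up to a multiple of 32, e.g. 64 for order 4,
  // 224 for order 6, 512 for order 8. For order 4 and tid = 37: x0 = 37 % 4 = 1,
  // y0 = (37/4) % 4 = 1, z0 = 37/16 = 2, and indeed 1 + 1*4 + 2*16 = 37.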
504 // Loop over atoms pos..pos_end-1
505 int iadd = blockDim.x*blockDim.y/order3;
506 int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
507 int iend = pos_end - blockIdx.x*blockDim.x;
508 for (;i < iend;i += iadd) {
509 int x = sh_ix[i] + x0;
510 int y = sh_iy[i] + y0;
511 int z = sh_iz[i] + z0;
513 if (x >= nfftx) x -= nfftx;
515 if (periodicY && (y >= nffty)) y -= nffty;
516 if (!periodicY && (y < 0 || y >= ysize)) continue;
518 if (periodicZ && (z >= nfftz)) z -= nfftz;
519 if (!periodicZ && (z < 0 || z >= zsize)) continue;
521 // Get position on the grid
522 int ind = x + xdim*(y + ysize*(z));
524 // Here we unroll the order^3 loop (e.g. the 6x6x6 loop, 216 grid points, for order 6).
525 // NOTE: order3 is rounded up to a multiple of 32, so for order 6 we use 7*32=224 threads to do this
526 // Calculate interpolated charge value and store it to global memory
528 if (tid < order*order*order)
529 write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
535 template <typename AT, int order, bool periodicY, bool periodicZ>
537 spread_charge_kernel_v2(const float4 *xyzq, const int ncoord,
538 const int nfftx, const int nffty, const int nfftz,
539 const float nfftx_f, const float nffty_f,
541 const int xsize, const int ysize, const int zsize,
542 const int xdim, const int y00, const int z00,
545 // Shared memory use:
546 // order = 4: 1920 bytes
547 // order = 6: 2688 bytes
548 // order = 8: 3456 bytes
549 __shared__ int sh_ix[WARPSIZE];
550 __shared__ int sh_iy[WARPSIZE];
551 __shared__ int sh_iz[WARPSIZE];
552 __shared__ float sh_thetax[order*WARPSIZE];
553 __shared__ float sh_thetay[order*WARPSIZE];
554 __shared__ float sh_thetaz[order*WARPSIZE];
556 // Process atoms pos to pos_end-1 (blockDim.x)
557 const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
558 const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
559 // const float invorder = 1.f/order;
561 if (pos < pos_end && threadIdx.y == 0) {
563 float4 xyzqi = xyzq[pos];
569 float frx = (nfftx_f)*x;
570 float fry = (nffty_f)*y;
571 float frz = (nfftz_f)*z;
577 float wx = frx - (float)frxi;
578 float wy = fry - (float)fryi;
579 float wz = frz - (float)frzi;
581 if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
582 if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
584 sh_ix[threadIdx.x] = frxi;
585 sh_iy[threadIdx.x] = fryi - y00;
586 sh_iz[threadIdx.x] = frzi - z00;
590 calc_one_theta<float, order>(wx, theta);
592 for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
594 calc_one_theta<float, order>(wy, theta);
596 for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
598 calc_one_theta<float, order>(wz, theta);
600 for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
606 // Grid point location, values of (x0, y0, z0) are in range 0..order-1
607 // NOTE: Only threads tid=0...order*order*order-1 do any computation
608 const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
609 const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
610 const int x0 = tid % order;
611 const int y0 = (tid / order) % order;
612 //const int y0 = (int)(tid * invorder) % order;
613 const int z0 = tid / (order*order);
614 //const int z0 = tid * invorder * invorder;
616 // Loop over atoms pos..pos_end-1
617 int iadd = blockDim.x*blockDim.y/order3;
618 int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
619 int iend = pos_end - blockIdx.x*blockDim.x;
620 for (;i < iend;i += iadd) {
621 int x = sh_ix[i] + x0;
622 int y = sh_iy[i] + y0;
623 int z = sh_iz[i] + z0;
625 if (x >= nfftx) x -= nfftx;
627 if (periodicY && (y >= nffty)) y -= nffty;
628 if (!periodicY && (y < 0 || y >= ysize)) continue;
630 if (periodicZ && (z >= nfftz)) z -= nfftz;
631 if (!periodicZ && (z < 0 || z >= zsize)) continue;
633 // Get position on the grid
634 int ind = x + xdim*(y + ysize*(z));
636 // Here we unroll the order^3 loop (e.g. the 6x6x6 loop, 216 grid points, for order 6).
637 // NOTE: order3 is rounded up to a multiple of WARPSIZE, so with 32-wide warps and order 6 we use 7*32=224 threads to do this
638 // Calculate interpolated charge value and store it to global memory
640 if (tid < order*order*order)
641 write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
647 template<typename AT, int kPatchGridDim>
649 __launch_bounds__(PatchLevelPmeData::kNumThreads, 4)
650 spread_charge_patch_8th(
651 const int numPatches,
652 const int3 patchGridDim,
653 const CudaLocalRecord* localRecords,
654 const int3* patchGridOffsets,
656 const int nfftx, const int nffty, const int nfftz,
657 const float nfftx_f, const float nffty_f,
659 const int xsize, const int ysize, const int zsize,
660 const int xdim, const int y00, const int z00,
663 // Define local copies of constexpr variables
664 constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
665 constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
666 constexpr int kDims = PatchLevelPmeData::kDim;
667 constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDimPad;
668 constexpr int kOrder = 8;
670 // Setup shared memory buffers
671 extern __shared__ float sharedBuffer[];
672 char4* s_indices = (char4*) sharedBuffer;
673 float* s_theta = (float*) &sharedBuffer[kNumThreads];
674 float* s_grid = (float*) &sharedBuffer[kNumThreads + kOrder * kDims * kThetaStride];
676 // Zero out grid shared memory buffer since we will be accumulating into it
677 const int numGridPoints = kPatchGridDimPad * kPatchGridDim * kPatchGridDim;
678 for (int i = threadIdx.x; i < numGridPoints; i += blockDim.x) {
683 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
684 const int numAtoms = localRecords[patchIndex].numAtoms;
685 const int offset = localRecords[patchIndex].bufferOffset;
686 const int3 patchGridOffset = patchGridOffsets[patchIndex];
688 const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
689 // Iterate over all the atoms in the patch in chunks, as we need to stash theta values in shared memory
691 for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
693 // Compute thetas with each thread processing a different atom
695 NAMD_WARP_SYNC(WARP_FULL_MASK);
696 const int atomIndex = iter * blockDim.x + threadIdx.x;
697 if (atomIndex < numAtoms) {
698 float4 xyzqi = xyzq[atomIndex + offset];
700 // This will just shift the unscaled positions to be between 0 and 1 instead of
701 // -0.5 and 0.5 without handling the boundary conditions. The boundary conditions
702 // will be handled when the grid is written from shared to global memory
703 float x = xyzqi.x + 0.5f;
704 float y = xyzqi.y + 0.5f;
705 float z = xyzqi.z + 0.5f;
708 float frx = (nfftx_f)*x;
709 float fry = (nffty_f)*y;
710 float frz = (nfftz_f)*z;
712 int frxi = (int)floorf(frx);
713 int fryi = (int)floorf(fry);
714 int frzi = (int)floorf(frz);
716 float wx = frx - (float)frxi;
717 float wy = fry - (float)fryi;
718 float wz = frz - (float)frzi;
720 // Because these are indices into the patch-local shared memory grid, it should be safe to store them as chars
722 s_indices[threadIdx.x].x = (char) (frxi - patchGridOffset.x);
723 s_indices[threadIdx.x].y = (char) (fryi - patchGridOffset.y);
724 s_indices[threadIdx.x].z = (char) (frzi - patchGridOffset.z);
728 calc_one_theta<float, kOrder>(wx, theta);
730 for (int i=0;i < kOrder;i++) {
731 // The x theta matrix has the atom as the fastest moving index and the point as the slowest moving index:
733 // atom 0 point 0, atom 1 point 0, atom 2 point 0, ...
734 // atom 0 point 1, atom 1 point 1, atom 2 point 1, ...
737 const int idx = 0 * (kOrder * kThetaStride) + i * kThetaStride + threadIdx.x;
738 s_theta[idx] = q*theta[i];
741 calc_one_theta<float, kOrder>(wy, theta);
743 for (int i=0;i < kOrder;i++) {
744 // Each thread will process 2 grid points in the Y dimension, and we want that data to be consecutive in
745 // shared memory to allow for vector loads:
747 // atom 0 point 0, atom 0 point 4, atom 1 point 0, atom 1 point 4, ...
748 // atom 0 point 1, atom 0 point 5, atom 1 point 1, atom 1 point 5, ...
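          // Index check for the layout described above (for threadIdx.x = 0, an illustration):
          //   i = 0 -> idx = base + 0,              i = 4 -> idx = base + 1,
          //   i = 1 -> idx = base + 2*kThetaStride, i = 5 -> idx = base + 2*kThetaStride + 1,
          // so points p and p+4 of the same atom sit in adjacent floats and can be read as one
          // float2 (here base = 1*(kOrder*kThetaStride), the start of the y block).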
751 const int indexInFloat2 = i / 4;
752 const int float2Index = i % 4;
753 const int idx = 1 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
754 s_theta[idx] = theta[i];
757 calc_one_theta<float, kOrder>(wz, theta);
759 for (int i=0;i < kOrder;i++) {
760 // The z theta values use the same layout as the y theta values since each thread will
761 // process 2 grid points in the Z dimension
762 const int indexInFloat2 = i / 4;
763 const int float2Index = i % 4;
764 const int idx = 2 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
765 s_theta[idx] = theta[i];
771 // Compute the grid contributions and store in shared memory with all threads processing a single atom concurrently
774 // Each thread block will compute an 8x8x8 grid of charges with 128 threads. Each thread will
775 // compute a 1x2x2 non-contiguous sub-grid based on the layout of the theta described above.
776 // For example, thread 0 will compute (0,0,0), (0,4,0), (0,0,4), (0,4,4)
777 const int xOffset = threadIdx.x % kOrder;
778 const int yPartialOffset = (threadIdx.x / kOrder) % (kOrder / 2);
779 const int zPartialOffset = threadIdx.x / (kOrder * (kOrder / 2));
781 constexpr int kYItersPerThread = 2;
782 constexpr int kZItersPerThread = 2;
783 constexpr int kYValPerIter = 4;
784 constexpr int kZValPerIter = 4;
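      // Worked mapping, assuming kNumThreads = 128 as stated above:
      //   xOffset        = threadIdx.x % 8       (0..7)
      //   yPartialOffset = (threadIdx.x / 8) % 4 (0..3)
      //   zPartialOffset = threadIdx.x / 32      (0..3)
      // Each thread covers y in {yPartialOffset, yPartialOffset+4} and z in
      // {zPartialOffset, zPartialOffset+4}, so 128 threads * 4 points = 512 = 8^3 grid points;
      // e.g. thread 0 covers (0, {0,4}, {0,4}) and thread 9 covers (1, {1,5}, {0,4}).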
786 const int atomsToProcess = min(kNumThreads, numAtoms - iter * kNumThreads);
787 const int outerIters = ((atomsToProcess) / 4) * 4;
788 int thetaIndexOuter = 0;
790 // The loop over atoms is broken into chunks of 4 to enable loading multiple atoms' x theta data.
791 // Each warp only needs to load 8 distinct x theta values per atom, so some of the loaded bytes would be wasted. By
792 // loading the x theta data for 4 atoms at once, we avoid this.
793 for ( ; thetaIndexOuter < outerIters; thetaIndexOuter += 4) {
794 char4 r_global_indices[4];
798 for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
799 const int thetaIndex = thetaIndexOuter + thetaIndexInner;
800 // The compiler should auto vectorize these loads
801 r_global_indices[thetaIndexInner] = s_indices[thetaIndex];
802 r_x_val[thetaIndexInner] = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
806 for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
807 const int thetaIndex = thetaIndexOuter + thetaIndexInner;
809 const int global_idx_x = r_global_indices[thetaIndexInner].x;
810 const int global_idx_y = r_global_indices[thetaIndexInner].y;
811 const int global_idx_z = r_global_indices[thetaIndexInner].z;
813 const int shared_idx_x = global_idx_x + xOffset;
814 const float x_val = r_x_val[thetaIndexInner];
817 int r_shared_idx_y[kYItersPerThread];
818 float r_grid_val_xy[kYItersPerThread];
820 for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
821 const int yOffset = 2 * yPartialOffset;
822 r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
823 // The compiler should auto vectorize these loads
824 const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
825 r_grid_val_xy[yIter] = x_val * y_val;
828 int r_shared_idx_z[kZItersPerThread];
829 float r_z_val[kZItersPerThread];
831 for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
832 const int zOffset = 2 * zPartialOffset;
833 r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
834 r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
838 for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
840 for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
841 const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
842 const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
843 s_grid[shared_idx] += grid_val;
850 // Compute for the remaining atoms
851 for (int thetaIndex = thetaIndexOuter ; thetaIndex < atomsToProcess; thetaIndex++) {
852 const int global_idx_x = s_indices[thetaIndex].x;
853 const int global_idx_y = s_indices[thetaIndex].y;
854 const int global_idx_z = s_indices[thetaIndex].z;
856 const int shared_idx_x = global_idx_x + xOffset;
857 const float x_val = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
859 int r_shared_idx_y[kYItersPerThread];
860 float r_grid_val_xy[kYItersPerThread];
862 for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
863 const int yOffset = 2 * yPartialOffset;
864 r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
865 const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
866 r_grid_val_xy[yIter] = x_val * y_val;
869 int r_shared_idx_z[kZItersPerThread];
870 float r_z_val[kZItersPerThread];
872 for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
873 const int zOffset = 2 * zPartialOffset;
874 r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
875 r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
879 for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
881 for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
882 const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
883 const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
884 s_grid[shared_idx] += grid_val;
891 // Write shared grid to global memory
893 const int indexInWarp = threadIdx.x % 32;
894 const int warpIndex = threadIdx.x / 32;
895 const int numWarps = blockDim.x / 32;
896 const bool isActive = (indexInWarp < patchGridDim.x);
897 if (!isActive) continue;
899 const int shared_idx_x = indexInWarp;
900 int global_idx_x = shared_idx_x + patchGridOffset.x;
901 if (global_idx_x >= nfftx) global_idx_x -= nfftx;
902 else if (global_idx_x < 0) global_idx_x += nfftx;
905 for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
906 const int shared_idx_y = yIter;
907 int global_idx_y = shared_idx_y + patchGridOffset.y;
908 if (global_idx_y >= nffty) global_idx_y -= nffty;
909 else if (global_idx_y < 0) global_idx_y += nffty;
911 for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
912 const int shared_idx_z = zIter;
913 int global_idx_z = shared_idx_z + patchGridOffset.z;
914 if (global_idx_z >= nfftz) global_idx_z -= nfftz;
915 else if (global_idx_z < 0) global_idx_z += nfftz;
917 const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
918 const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
920 atomicAdd(data + global_idx, s_grid[shared_idx]);
921 s_grid[shared_idx] = 0.0f;
928 //-----------------------------------------------------------------------------------------
929 // Generic version cannot be used
930 template <typename T> __forceinline__ __device__
931 void gather_force_store(const float fx, const float fy, const float fz,
932 const int stride, const int pos,
936 // Template specialization for "float"
937 template <> __forceinline__ __device__
938 void gather_force_store<float>(const float fx, const float fy, const float fz,
939 const int stride, const int pos,
941 // Store into strided float array (x, y, z components separated by stride)
943 force[pos+stride] = fy;
944 force[pos+stride*2] = fz;
947 // Template specialization for "float3"
948 template <> __forceinline__ __device__
949 void gather_force_store<float3>(const float fx, const float fy, const float fz,
950 const int stride, const int pos,
952 // Store into non-strided "float3" array
954 // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
955 // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
956 reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
957 reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
958 reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
965 //-----------------------------------------------------------------------------------------
967 // Per-atom data structure for the gather_force kernels
968 template <typename T, int order>
986 // Gathers forces from the grid
987 // blockDim.x = Number of atoms each block loads
988 // blockDim.x*blockDim.y = Total number of threads per block
990 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
991 __global__ void gather_force(const float4 *xyzq, const int ncoord,
992 const int nfftx, const int nffty, const int nfftz,
993 const int xsize, const int ysize, const int zsize,
994 const int xdim, const int y00, const int z00,
995 // const float recip1, const float recip2, const float recip3,
996 const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
998 const cudaTextureObject_t gridTexObj,
1003 const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
1006 __shared__ gather_t<CT, order> shmem[32];
1008 const int pos = blockIdx.x*blockDim.x + threadIdx.x;
1009 const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
1011 // Load atom data into shared memory
1012 if (pos < pos_end && threadIdx.y == 0) {
1014 float4 xyzqi = xyzq[pos];
1020 float frx = ((float)nfftx)*x;
1021 float fry = ((float)nffty)*y;
1022 float frz = ((float)nfftz)*z;
1024 int frxi = (int)frx;
1025 int fryi = (int)fry;
1026 int frzi = (int)frz;
1028 float wx = frx - (float)frxi;
1029 float wy = fry - (float)fryi;
1030 float wz = frz - (float)frzi;
1032 if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
1033 if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
1035 shmem[threadIdx.x].ix = frxi;
1036 shmem[threadIdx.x].iy = fryi - y00;
1037 shmem[threadIdx.x].iz = frzi - z00;
1038 shmem[threadIdx.x].charge = q;
1040 float3 theta_tmp[order];
1041 float3 dtheta_tmp[order];
1042 calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
1045 for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
1048 for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
1051 for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
1054 for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
1057 for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
1060 for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
1065 // We divide the order x order x order cube into 8 sub-cubes.
1066 // Each of these sub-cubes is handled by a single thread.
1067 // The size of the sub-cubes is:
1071 const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
1072 // Calculate the starting index on the sub-cube for this thread
1074 const int t = (tid % 8); // sub-cube index (0...7)
1075 // t = (tx0 + ty0*2 + tz0*4)/nsc
1076 // (tx0, ty0, tz0) gives the starting index of the nsc x nsc x nsc sub-cube
1077 const int tz0 = (t / 4)*nsc;
1078 const int ty0 = ((t / 2) % 2)*nsc;
1079 const int tx0 = (t % 2)*nsc;
1082 // Calculate forces for 32 atoms. We have 32*2 = 64 threads
1083 // Loop is iterated 4 times:
1085 // Threads 0...7 = atoms 0, 8, 16, 24
1086 // Threads 8...15 = atoms 1, 9, 17, 25
1087 // Threads 16...23 = atoms 2, 10, 18, 26
1089 // Threads 56...63 = atoms 7, 15, 23, 31
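  // Worked example of the sub-cube split: for order 4, nsc = 2 and the 8 threads of a group
  // cover origins (tx0,ty0,tz0) = ((t%2)*2, ((t/2)%2)*2, (t/4)*2); e.g. t = 5 handles
  // x in {2,3}, y in {0,1}, z in {2,3}. In every case 8*nsc^3 = order^3
  // (8*8=64, 8*27=216, 8*64=512), so the whole stencil is covered.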
1093 const int base_end = pos_end - blockIdx.x*blockDim.x;
1095 // Make sure to mask out any threads that are not running the loop.
1096 // This will happen if the number of atoms is not a multiple of 32.
1097 #if defined(NAMD_HIP)
1098 WarpMask warp_mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
1100 WarpMask warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
1103 while (base < base_end) {
1108 int ix0 = shmem[base].ix;
1109 int iy0 = shmem[base].iy;
1110 int iz0 = shmem[base].iz;
1112 // Each thread calculates a nsc x nsc x nsc sub-cube
1114 for (int i=0;i < nsc*nsc*nsc;i++) {
1115 int tz = tz0 + (i/(nsc*nsc));
1116 int ty = ty0 + ((i/nsc) % nsc);
1117 int tx = tx0 + (i % nsc);
1122 if (ix >= nfftx) ix -= nfftx;
1124 if (periodicY && (iy >= nffty)) iy -= nffty;
1125 if (!periodicY && (iy < 0 || iy >= ysize)) continue;
1127 if (periodicZ && (iz >= nfftz)) iz -= nfftz;
1128 if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
1130 int ind = ix + (iy + iz*ysize)*xdim;
1132 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
1133 float q0 = __ldg(&data[ind]);
1135 float q0 = tex1Dfetch<float>(gridTexObj, ind);
1137 float thx0 = shmem[base].thetax[tx];
1138 float thy0 = shmem[base].thetay[ty];
1139 float thz0 = shmem[base].thetaz[tz];
1140 float dthx0 = shmem[base].dthetax[tx];
1141 float dthy0 = shmem[base].dthetay[ty];
1142 float dthz0 = shmem[base].dthetaz[tz];
1143 f1 += dthx0 * thy0 * thz0 * q0;
1144 f2 += thx0 * dthy0 * thz0 * q0;
1145 f3 += thx0 * thy0 * dthz0 * q0;
1148 //-------------------------
1151 const int i = threadIdx.x & 7;
1153 f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
1154 f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
1155 f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
1157 f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
1158 f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
1159 f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
1161 f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
1162 f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
1163 f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
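    // Note: the three shuffle steps above (offsets 4, 2, 1 within a width-8 group, with
    // source lanes wrapping modulo 8) form a butterfly reduction over the 8 threads that
    // share one atom, so afterwards every lane in the group (in particular the lane with
    // i == 0) holds the full sums of f1, f2, f3 for that atom, which are then written back
    // to shmem[base] below.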
1166 shmem[base].f1 = f1;
1167 shmem[base].f2 = f2;
1168 shmem[base].f3 = f3;
1172 #if defined(NAMD_HIP)
1173 warp_mask = NAMD_WARP_BALLOT(warp_mask, (base < base_end) );
1175 warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
1181 if (pos < pos_end && threadIdx.y == 0) {
1182 float f1 = shmem[threadIdx.x].f1;
1183 float f2 = shmem[threadIdx.x].f2;
1184 float f3 = shmem[threadIdx.x].f3;
1185 float q = -shmem[threadIdx.x].charge; //*ccelec_float;
1186 // float fx = q*recip1*f1*nfftx;
1187 // float fy = q*recip2*f2*nffty;
1188 // float fz = q*recip3*f3*nfftz;
1189 float fx = q*f1*nfftx;
1190 float fy = q*f2*nffty;
1191 float fz = q*f3*nfftz;
1192 gather_force_store<FT>(fx, fy, fz, stride, pos, force);
1197 template<typename CT, typename FT, int kPatchGridDim>
1199 __launch_bounds__(PatchLevelPmeData::kNumThreads, 1)
1200 gather_force_patch_8th( const int numPatches,
1201 const int3 patchGridDim,
1202 const CudaLocalRecord* localRecords,
1203 const int3* patchGridOffsets,
1205 const int nfftx, const int nffty, const int nfftz,
1206 const int xsize, const int ysize, const int zsize,
1207 const int xdim, const int y00, const int z00,
1212 // Define local copies of constexpr variables
1213 constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
1214 constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
1215 constexpr int kDims = PatchLevelPmeData::kDim;
1216 constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDim;
1217 constexpr int kOrder = 8;
1219 // Setup shared memory buffers
1220 extern __shared__ float sharedBuffer[];
1221 float2* s_x_theta = (float2*) sharedBuffer;
1222 float* s_grid = (float*) &sharedBuffer[2 * kOrder * kThetaStride];
1224 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1225 const int numAtoms = localRecords[patchIndex].numAtoms;
1226 const int offset = localRecords[patchIndex].bufferOffset;
1227 const int3 patchGridOffset = patchGridOffsets[patchIndex];
1230 // Load grid into shared memory
1232 const int indexInWarp = threadIdx.x % 32;
1233 const int warpIndex = threadIdx.x / 32;
1234 const int numWarps = blockDim.x / 32;
1235 const bool isActive = (indexInWarp < patchGridDim.x);
1237 const int shared_idx_x = indexInWarp;
1238 int global_idx_x = shared_idx_x + patchGridOffset.x;
1239 if (global_idx_x >= nfftx) global_idx_x -= nfftx;
1240 else if (global_idx_x < 0) global_idx_x += nfftx;
1243 for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
1244 const int shared_idx_y = yIter;
1245 int global_idx_y = shared_idx_y + patchGridOffset.y;
1246 if (global_idx_y >= nffty) global_idx_y -= nffty;
1247 else if (global_idx_y < 0) global_idx_y += nffty;
1249 for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
1250 const int shared_idx_z = zIter;
1251 int global_idx_z = shared_idx_z + patchGridOffset.z;
1252 if (global_idx_z >= nfftz) global_idx_z -= nfftz;
1253 else if (global_idx_z < 0) global_idx_z += nfftz;
1255 const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
1256 const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
1259 s_grid[shared_idx] = data[global_idx];
1265 // Iterate over the atoms in chunks of size equal to the number of threads. For small systems, we can
1266 // assign multiple thread blocks per patch in the Y dimension to increase the amount of parallelism
1268 // Each thread will compute the forces on one atom; this simplifies the kernel since it doesn't need
1269 // to reduce any partial sums of forces.
1270 const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
1271 for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
1272 const int atomIndex = iter * blockDim.x + threadIdx.x;
1274 if (atomIndex < numAtoms) {
1275 gather_t<CT, kOrder> atomData;
1277 float4 xyzqi = xyzq[atomIndex + offset];
1278 // The boundary conditions are handled when the grid is loaded into shared memory, so we
1279 // just need to shift the positions here
1280 float x = xyzqi.x + 0.5f;
1281 float y = xyzqi.y + 0.5f;
1282 float z = xyzqi.z + 0.5f;
1285 float frx = ((float)nfftx)*x;
1286 float fry = ((float)nffty)*y;
1287 float frz = ((float)nfftz)*z;
1289 int frxi = (int)floorf(frx);
1290 int fryi = (int)floorf(fry);
1291 int frzi = (int)floorf(frz);
1293 float wx = frx - (float)frxi;
1294 float wy = fry - (float)fryi;
1295 float wz = frz - (float)frzi;
1297 atomData.ix = frxi - patchGridOffset.x;
1298 atomData.iy = fryi - patchGridOffset.y;
1299 atomData.iz = frzi - patchGridOffset.z;
1300 atomData.charge = q;
1302 float3 theta_tmp[kOrder];
1303 float3 dtheta_tmp[kOrder];
1304 calc_theta_dtheta<float, float3, kOrder>(wx, wy, wz, theta_tmp, dtheta_tmp);
1307 for (int i=0;i < kOrder;i++) {
1308 // x theta values are stored in shared memory to reduce register pressure
1309 s_x_theta[i * kThetaStride + threadIdx.x].x = theta_tmp[i].x;
1310 s_x_theta[i * kThetaStride + threadIdx.x].y = dtheta_tmp[i].x;
1311 atomData.thetay[i] = theta_tmp[i].y;
1312 atomData.dthetay[i] = dtheta_tmp[i].y;
1313 atomData.thetaz[i] = theta_tmp[i].z;
1314 atomData.dthetaz[i] = dtheta_tmp[i].z;
1323 for (int xIter = 0; xIter < kOrder; xIter++) {
1324 const int xOffset = xIter;
1325 // To reduce register pressure, the x theta values are stored in shared memory
1326 float thx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].x;
1327 float dthx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].y;
1330 for (int yIter = 0; yIter < kOrder; yIter++) {
1332 for (int zIter = 0; zIter < kOrder; zIter++) {
1333 const int yOffset = yIter;
1334 const int zOffset = zIter;
1336 const int shared_idx_x = atomData.ix + xOffset;
1337 const int shared_idx_y = atomData.iy + yIter;
1338 const int shared_idx_z = atomData.iz + zIter;
1339 const int shared_idx = shared_idx_x + kPatchGridDimPad *
1340 (shared_idx_y + shared_idx_z * kPatchGridDim);
1341 // This load is essentially doing random accesses to shared memory, so
1342 // it results in a significant amount of bank conflicts
1343 const float grid_val = s_grid[shared_idx];
1345 float thy0 = atomData.thetay[yOffset];
1346 float thz0 = atomData.thetaz[zOffset];
1347 float dthy0 = atomData.dthetay[yOffset];
1348 float dthz0 = atomData.dthetaz[zOffset];
1350 f1 += dthx0 * thy0 * thz0 * grid_val;
1351 f2 += thx0 * dthy0 * thz0 * grid_val;
1352 f3 += thx0 * thy0 * dthz0 * grid_val;
1357 const float fx = -1.0f * atomData.charge * f1 * nfftx;
1358 const float fy = -1.0f * atomData.charge * f2 * nffty;
1359 const float fz = -1.0f * atomData.charge * f3 * nfftz;
1360 gather_force_store<FT>(fx, fy, fz, stride, atomIndex + offset, force);
1367 const int TILEDIM = 32;
1368 const int TILEROWS = 8;
1370 template <typename T>
1371 __device__ __forceinline__
1372 void transpose_xyz_yzx_device(
1373 const int x_in, const int y_in, const int z_in,
1374 const int x_out, const int y_out,
1375 const int nx, const int ny, const int nz,
1376 const int xsize_in, const int ysize_in,
1377 const int ysize_out, const int zsize_out,
1378 const T* data_in, T* data_out) {
1381 __shared__ T tile[TILEDIM][TILEDIM+1];
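  // Note: the +1 padding makes each tile row 33 words long, so the column accesses
  // tile[threadIdx.x][j] used when writing the transposed data fall into different shared
  // memory banks (33 mod 32 != 0), avoiding the classic 32-way bank conflict of a 32x32 tile.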
1383 // Read (x,y) data_in into tile (shared memory)
1384 for (int j=0;j < TILEDIM;j += TILEROWS)
1385 if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
1386 tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
1390 // Write (y,x) tile into data_out
1391 const int z_out = z_in;
1392 for (int j=0;j < TILEDIM;j += TILEROWS)
1393 if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
1394 data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
1398 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1400 template <typename T>
1401 __global__ void transpose_xyz_yzx_kernel(
1402 const int nx, const int ny, const int nz,
1403 const int xsize_in, const int ysize_in,
1404 const int ysize_out, const int zsize_out,
1405 const T* data_in, T* data_out) {
1407 int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1408 int y_in = blockIdx.y * TILEDIM + threadIdx.y;
1409 int z_in = blockIdx.z + threadIdx.z;
1411 int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1412 int y_out = blockIdx.y * TILEDIM + threadIdx.x;
1414 transpose_xyz_yzx_device<T>(
1419 ysize_out, zsize_out,
1424 __shared__ T tile[TILEDIM][TILEDIM+1];
1426 int x = blockIdx.x * TILEDIM + threadIdx.x;
1427 int y = blockIdx.y * TILEDIM + threadIdx.y;
1428 int z = blockIdx.z + threadIdx.z;
1430 // Read (x,y) data_in into tile (shared memory)
1431 for (int j=0;j < TILEDIM;j += TILEROWS)
1432 if ((x < nx) && (y + j < ny) && (z < nz))
1433 tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
1437 // Write (y,x) tile into data_out
1438 x = blockIdx.x * TILEDIM + threadIdx.y;
1439 y = blockIdx.y * TILEDIM + threadIdx.x;
1440 for (int j=0;j < TILEDIM;j += TILEROWS)
1441 if ((x + j < nx) && (y < ny) && (z < nz))
1442 data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
1447 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1448 // Batch index bi is encoded in blockIdx.z, where
1449 // blockIdx.z = 0...nz-1 are for batch 1
1450 // blockIdx.z = nz...2*nz-1 are for batch 2
1452 // gridDim.z = nz*numBatches
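// Worked example of the blockIdx.z encoding (illustrative sizes): with nz = 16 and 4 batches,
// gridDim.z = 64; blockIdx.z = 37 decodes to batch bi = 37/16 = 2 and z_in = 37 % 16 = 5,
// matching the index arithmetic in the kernel below.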
1454 template <typename T>
1455 __global__ void batchTranspose_xyz_yzx_kernel(
1456 const TransposeBatch<T>* batches,
1457 const int ny, const int nz,
1458 const int xsize_in, const int ysize_in) {
1460 int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1461 int y_in = blockIdx.y * TILEDIM + threadIdx.y;
1462 int z_in = (blockIdx.z % nz) + threadIdx.z;
1464 int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1465 int y_out = blockIdx.y * TILEDIM + threadIdx.x;
1467 int bi = blockIdx.z/nz;
1469 TransposeBatch<T> batch = batches[bi];
1471 int ysize_out = batch.ysize_out;
1472 int zsize_out = batch.zsize_out;
1473 T* data_in = batch.data_in;
1474 T* data_out = batch.data_out;
1476 transpose_xyz_yzx_device<T>(
1481 ysize_out, zsize_out,
1488 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1490 template <typename T>
1491 __forceinline__ __device__
1492 void transpose_xyz_yzx_dev(
1494 const int nx, const int ny, const int nz,
1495 const int xsize_in, const int ysize_in,
1496 const int xsize_out, const int ysize_out,
1497 const T* data_in, T* data_out) {
1500 __shared__ T tile[TILEDIM][TILEDIM+1];
1502 int x = blockIdx.x * TILEDIM + threadIdx.x;
1503 int y = blockIdx.y * TILEDIM + threadIdx.y;
1504 // int z = blockIdx.z + threadIdx.z;
1505 int z = blockz + threadIdx.z;
1507 // Read (x,y) data_in into tile (shared memory)
1508 for (int j=0;j < TILEDIM;j += TILEROWS)
1509 if ((x < nx) && (y + j < ny) && (z < nz))
1510 tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
1514 // Write (y,x) tile into data_out
1515 x = blockIdx.x * TILEDIM + threadIdx.y;
1516 y = blockIdx.y * TILEDIM + threadIdx.x;
1517 for (int j=0;j < TILEDIM;j += TILEROWS)
1518 if ((x + j < nx) && (y < ny) && (z < nz))
1519 data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
1524 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1525 // (nx, ny, nz) = size of the transposed volume
1526 // (xsize_in, ysize_in, zsize_in) = size of the input data
1527 // The transpose is performed for nblock separate memory blocks (block index encoded in blockIdx.z)
1529 template <typename T>
1530 __global__ void transpose_xyz_yzx_kernel(
1532 const int nx, const int ny, const int nz,
1533 const int xsize_in, const int ysize_in,
1534 const int xsize_out, const int ysize_out,
1535 const T* data_in, T* data_out) {
1537 const int iblock = blockIdx.z/nz;
1539 if (iblock < nblock) {
1540 transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
1541 xsize_in, ysize_in, xsize_out, ysize_out,
1548 template <typename T>
1549 __device__ __forceinline__
1550 void transpose_xyz_zxy_device(
1551 const int x_in, const int y_in, const int z_in,
1552 const int x_out, const int z_out,
1553 const int nx, const int ny, const int nz,
1554 const int xsize_in, const int ysize_in,
1555 const int zsize_out, const int xsize_out,
1556 const T* data_in, T* data_out) {
1559 __shared__ T tile[TILEDIM][TILEDIM+1];
1561 // Read (x,z) data_in into tile (shared memory)
1562 for (int k=0;k < TILEDIM;k += TILEROWS)
1563 if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
1564 tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
1568 // Write (z,x) tile into data_out
1569 const int y_out = y_in;
1570 for (int k=0;k < TILEDIM;k += TILEROWS)
1571 if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
1572 data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
1576 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1578 template <typename T>
1579 __global__ void transpose_xyz_zxy_kernel(
1580 const int nx, const int ny, const int nz,
1581 const int xsize_in, const int ysize_in,
1582 const int zsize_out, const int xsize_out,
1583 const T* data_in, T* data_out) {
1585 int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1586 int y_in = blockIdx.z + threadIdx.z;
1587 int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1589 int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1590 int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1592 transpose_xyz_zxy_device<T>(
1593 x_in, y_in, z_in, x_out, z_out,
1596 zsize_out, xsize_out,
1602 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1603 // Batch index bi is encoded in blockIdx.z, where
1604 // blockIdx.z = 0...ny-1 are for batch 1
1605 // blockIdx.z = ny...2*ny-1 are for batch 2
1607 // gridDim.z = ny*numBatches
1609 template <typename T>
1610 __global__ void batchTranspose_xyz_zxy_kernel(
1611 const TransposeBatch<T>* batches,
1612 const int ny, const int nz,
1613 const int xsize_in, const int ysize_in) {
1615 int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1616 int y_in = (blockIdx.z % ny) + threadIdx.z;
1617 int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1619 int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1620 int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1622 int bi = blockIdx.z/ny;
1624 TransposeBatch<T> batch = batches[bi];
1626 int zsize_out = batch.zsize_out;
1627 int xsize_out = batch.xsize_out;
1628 T* data_in = batch.data_in;
1629 T* data_out = batch.data_out;
1631 transpose_xyz_zxy_device<T>(
1632 x_in, y_in, z_in, x_out, z_out,
1635 zsize_out, xsize_out,
1640 // XXX should be defined in a centralized place
1642 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
1644 template<int BLOCK_SIZE>
1645 __global__ void selfEnergyKernel(
1647 const float4 *atoms,
1652 double qq=0; // sum q^2 over all threads
1653 int i = threadIdx.x + blockIdx.x * blockDim.x;
1655 double q = atoms[i].w;
1659 typedef cub::BlockReduce<double, BLOCK_SIZE> BlockReduce;
1660 __shared__ typename BlockReduce::TempStorage temp_storage;
1661 qq = BlockReduce(temp_storage).Sum(qq);
1665 if (threadIdx.x == 0) {
1666 // each charge already scaled by sqrt(COULOMB * scaling * dielectric_1)
1667 // finish scaling self energy term
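    // For reference, this is the standard Ewald self-energy correction
    //   E_self = -(ewaldcof/sqrt(pi)) * sum_i q_i^2
    // with ewaldcof the Ewald screening parameter; since the COULOMB*scaling*dielectric_1
    // factor is already folded into the charges as noted above, only -ewaldcof/sqrt(pi)
    // remains to be applied here.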
1668 double c = -ewaldcof * (1.0 / SQRT_PI);
1669 atomicAdd(selfEnergy, c*qq);
1673 double compute_selfEnergy(
1674 double *d_selfEnergy,
1675 const float4 *d_atoms,
1678 cudaStream_t stream)
1680 double selfEnergy = 0;
1681 const int block = 128;
1682 const int grid = (natoms + block - 1) / block;
1683 selfEnergyKernel<block><<<grid, block, 0, stream>>>(d_selfEnergy,
1684 d_atoms, natoms, ewaldcof);
1685 cudaCheck(cudaStreamSynchronize(stream));
1686 copy_DtoH<double>(d_selfEnergy, &selfEnergy, 1, stream);
1687 cudaCheck(cudaStreamSynchronize(stream));
1691 //#######################################################################################
1692 //#######################################################################################
1693 //#######################################################################################
1695 void spread_charge(const float4 *atoms, const int numAtoms,
1696 const int nfftx, const int nffty, const int nfftz,
1697 const int xsize, const int ysize, const int zsize,
1698 const int xdim, const int y00, const int z00,
1699 const bool periodicY, const bool periodicZ,
1700 float* data, const int order, cudaStream_t stream) {
1702 dim3 nthread, nblock;
1709 nblock.x = (numAtoms - 1)/nthread.x + 1;
1712 if (periodicY && periodicZ)
1713 spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1715 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1717 spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1719 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1721 spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1723 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1725 spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1727 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1734 nblock.x = (numAtoms - 1)/nthread.x + 1;
1737 if (periodicY && periodicZ)
1738 spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1740 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1742 spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1744 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1746 spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1748 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1750 spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1752 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1759 nblock.x = (numAtoms - 1)/nthread.x + 1;
1762 if (periodicY && periodicZ)
1763 spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1765 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1767 spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1769 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1771 spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1773 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1775 spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1777 nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1782 sprintf(str, "spread_charge, order %d not implemented",order);
1785 cudaCheck(cudaGetLastError());
1790 // JM new version of spread_charge routine
1791 void spread_charge_v2(
1792 const PatchLevelPmeData patchLevelPmeData,
1793 const float4 *atoms, const int numAtoms,
1794 const int nfftx, const int nffty, const int nfftz,
1795 const float nfftx_f, const float nffty_f, const float nfftz_f,
1797 const int xsize, const int ysize, const int zsize,
1798 const int xdim, const int y00, const int z00,
1799 const bool periodicY, const bool periodicZ,
1800 float* data, const int order, cudaStream_t stream) {
1802 dim3 nthread, nblock;
1804 if (patchLevelPmeData.compatible()) {
1805 const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
1806 const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
1807 cudaFuncSetAttribute(
1808 (void*)spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>,
1809 cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);
1810 spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>
1811 <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
1812 patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim,
1813 patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets,
1814 atoms, nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f,
1815 xsize, ysize, zsize, xdim, y00, z00, data
1817 cudaCheck(cudaGetLastError());
1826 nblock.x = (numAtoms - 1)/nthread.x + 1;
1829 if (periodicY && periodicZ)
1830 spread_charge_kernel_v2<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1832 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1834 spread_charge_kernel_v2<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1836 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1838 spread_charge_kernel_v2<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1840 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1842 spread_charge_kernel_v2<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1844 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1848 nthread.x = WARPSIZE;
1849 nthread.y = order3 / WARPSIZE;
1851 nblock.x = (numAtoms - 1)/nthread.x + 1;
1854 if (periodicY && periodicZ)
1855 spread_charge_kernel_v2<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1857 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1859 spread_charge_kernel_v2<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1861 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1863 spread_charge_kernel_v2<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1865 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1867 spread_charge_kernel_v2<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1869 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1873 nthread.x = WARPSIZE;
1874 nthread.y = order3 / WARPSIZE;
1876 nblock.x = (numAtoms - 1)/nthread.x + 1;
1879 if (periodicY && periodicZ)
1880 spread_charge_kernel_v2<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1882 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1884 spread_charge_kernel_v2<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1886 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1888 spread_charge_kernel_v2<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1890 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1892 spread_charge_kernel_v2<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1894 nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1899     sprintf(str, "spread_charge, order %d not implemented", order);
1902 // cudaCheck(cudaGetLastError());
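// --- Illustrative sketch (not part of the solver) -----------------------------------
// The patch-level path above requests patchLevelPmeData.spreadChargeSharedBytes of
// dynamic shared memory, which may exceed the default 48 KB per-block limit, so it
// first raises the kernel's cap with cudaFuncSetAttribute. Minimal sketch of that
// pattern with a hypothetical kernel; sharedBytes is assumed to be at least
// blockDim.x*sizeof(float), and the plain CUDA extern __shared__ form is shown
// (the solver uses HIP_DYNAMIC_SHARED under NAMD_HIP).
__global__ void demoLargeSharedKernel(float* out, const int n) {
  extern __shared__ float buf[];                 // dynamic shared memory, sized at launch
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  buf[threadIdx.x] = (i < n) ? out[i] : 0.f;
  __syncthreads();
  if (i < n) out[i] = buf[threadIdx.x];
}

void demoLargeSharedLaunch(float* d_out, const int n, const int sharedBytes, cudaStream_t stream) {
  // Opt this kernel in to more than the default dynamic shared memory (no-op for small sizes).
  cudaFuncSetAttribute((void*)demoLargeSharedKernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, sharedBytes);
  const int blockSize = 128;
  demoLargeSharedKernel<<<(n - 1)/blockSize + 1, blockSize, sharedBytes, stream>>>(d_out, n);
  cudaCheck(cudaGetLastError());
}
// -------------------------------------------------------------------------------------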
1906 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
1907 const int size1, const int size2, const int size3, const double kappa,
1908 const float recip1x, const float recip1y, const float recip1z,
1909 const float recip2x, const float recip2y, const float recip2z,
1910 const float recip3x, const float recip3y, const float recip3z,
1911 const double volume,
1912 const float* prefac1, const float* prefac2, const float* prefac3,
1913 const int k2_00, const int k3_00,
1914 const bool doEnergyVirial, double* energy, double* virial, float2* data,
1915 cudaStream_t stream) {
1924 int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1925 if (doEnergyVirial) {
1926 shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
1929 float piv_inv = (float)(1.0/(M_PI*volume));
1930 float fac = (float)(M_PI*M_PI/(kappa*kappa));
1932 if (doEnergyVirial) {
1934 scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
1935 (nfft1, nfft2, nfft3, size1, size2, size3,
1936 recip1x, recip1y, recip1z,
1937 recip2x, recip2y, recip2z,
1938 recip3x, recip3y, recip3z,
1939 prefac1, prefac2, prefac3,
1940 fac, piv_inv, k2_00, k3_00, data, energy, virial);
1942 scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
1943 (nfft1, nfft2, nfft3, size1, size2, size3,
1944 recip1x, recip1y, recip1z,
1945 recip2x, recip2y, recip2z,
1946 recip3x, recip3y, recip3z,
1947 prefac1, prefac2, prefac3,
1948 fac, piv_inv, k2_00, k3_00, data, energy, virial);
1952 scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
1953 (nfft1, nfft2, nfft3, size1, size2, size3,
1954 recip1x, recip1y, recip1z,
1955 recip2x, recip2y, recip2z,
1956 recip3x, recip3y, recip3z,
1957 prefac1, prefac2, prefac3,
1958 fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1960 scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
1961 (nfft1, nfft2, nfft3, size1, size2, size3,
1962 recip1x, recip1y, recip1z,
1963 recip2x, recip2y, recip2z,
1964 recip3x, recip3y, recip3z,
1965 prefac1, prefac2, prefac3,
1966 fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1969 cudaCheck(cudaGetLastError());
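// --- Editorial note (sketch, not part of the solver) --------------------------------
// Shared-memory sizing above: the kernel's dynamic buffer first holds the three
// B-spline prefactor arrays, sizeof(float)*(nfft1 + nfft2 + nfft3) bytes in total;
// when energy and virial are requested the same buffer also serves as per-warp
// reduction scratch, hence the max() with (nthread/WARPSIZE)*sizeof(RecipVirial_t).
// The two constants handed to the kernel are the usual smooth-PME reciprocal-space
// factors: fac = pi^2/kappa^2 is the coefficient in exp(-pi^2 m^2 / kappa^2) and
// piv_inv = 1/(pi*V) is the volume prefactor, so each mode m contributes roughly
//   (piv_inv/2) * exp(-fac*m^2)/m^2 * B(m) * |S(m)|^2
// to the reciprocal energy (standard Essmann et al. 1995 expression; the exact
// bookkeeping of the constant factors is done inside scalar_sum_kernel).
// -------------------------------------------------------------------------------------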
1974 const PatchLevelPmeData patchLevelPmeData,
1975 const float4 *atoms, const int numAtoms,
1976 // const float recip11, const float recip22, const float recip33,
1977 const int nfftx, const int nffty, const int nfftz,
1978 const int xsize, const int ysize, const int zsize,
1979 const int xdim, const int y00, const int z00,
1980 const bool periodicY, const bool periodicZ,
1981 const float* data, const int order, float3* force,
1983 const cudaTextureObject_t gridTexObj,
1985 cudaStream_t stream) {
1987 if (patchLevelPmeData.compatible()) {
1988 const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
1989 const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
1991 cudaFuncSetAttribute(
1992 (void*)gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>,
1993 cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);
1994 gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>
1995 <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
1996 patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim,
1997 patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets,
1998 atoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data,
2002 cudaCheck(cudaGetLastError());
2004 dim3 nthread(32, 2, 1);
2005 dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
2006 // dim3 nblock(npatch, 1, 1);
2010 if (periodicY && periodicZ)
2011 gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
2012 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2013 // recip11, recip22, recip33,
2020 gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
2021 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2022 // recip11, recip22, recip33,
2029 gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
2030 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2031 // recip11, recip22, recip33,
2038 gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
2039 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2040 // recip11, recip22, recip33,
2049 if (periodicY && periodicZ)
2050 gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
2051 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2052 // recip11, recip22, recip33,
2059 gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
2060 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2061 // recip11, recip22, recip33,
2068 gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
2069 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2070 // recip11, recip22, recip33,
2077 gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
2078 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2079 // recip11, recip22, recip33,
2088 if (periodicY && periodicZ)
2089 gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
2090 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2091 // recip11, recip22, recip33,
2098 gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
2099 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2100 // recip11, recip22, recip33,
2107 gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
2108 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2109 // recip11, recip22, recip33,
2116 gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
2117 (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2118 // recip11, recip22, recip33,
2128     sprintf(str, "gather_force, order %d not implemented", order);
2131 cudaCheck(cudaGetLastError());
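// --- Illustrative sketch (not part of the solver) -----------------------------------
// gather_force reads the charge grid through the cudaTextureObject_t passed in above.
// The object itself is created elsewhere in the solver; the sketch below only shows
// the generic way a linear float buffer can be exposed as a 1D texture object and
// fetched with tex1Dfetch (CUDA form shown; HIP uses the corresponding hip* calls).
// Names and sizes are hypothetical.
cudaTextureObject_t demoCreateGridTexture(const float* d_grid, const size_t numElements) {
  cudaResourceDesc resDesc = {};                          // zero-initialize the descriptor
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = const_cast<float*>(d_grid);
  resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
  resDesc.res.linear.sizeInBytes = numElements*sizeof(float);

  cudaTextureDesc texDesc = {};
  texDesc.readMode = cudaReadModeElementType;             // return the raw float values

  cudaTextureObject_t tex = 0;
  cudaCheck(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
  return tex;
}

__global__ void demoTextureReadKernel(const cudaTextureObject_t tex, float* out, const int n) {
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) out[i] = tex1Dfetch<float>(tex, i);          // cached read through the texture path
}
// -------------------------------------------------------------------------------------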
2138 void transpose_xyz_yzx(
2139 const int nx, const int ny, const int nz,
2140 const int xsize_in, const int ysize_in,
2141 const int ysize_out, const int zsize_out,
2142 const float2* data_in, float2* data_out, cudaStream_t stream) {
2144 dim3 numthread(TILEDIM, TILEROWS, 1);
2145 dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
2147 transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
2148 (nx, ny, nz, xsize_in, ysize_in,
2149 ysize_out, zsize_out,
2152 cudaCheck(cudaGetLastError());
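// --- Illustrative sketch (not part of the solver) -----------------------------------
// transpose_xyz_yzx_kernel (defined earlier in this file) follows the classic tiled
// shared-memory transpose: each TILEDIM x TILEROWS block stages a TILEDIM x TILEDIM
// tile in shared memory (padded by one column to avoid bank conflicts), then writes
// it back with the coordinates swapped so both the reads and the writes are coalesced.
// Simplified 2D version of that idiom with a hypothetical kernel name:
__global__ void demoTiledTranspose2D(const float2* in, float2* out, const int nx, const int ny) {
  __shared__ float2 tile[TILEDIM][TILEDIM+1];   // +1 pad removes shared-memory bank conflicts

  int x = blockIdx.x*TILEDIM + threadIdx.x;
  int y = blockIdx.y*TILEDIM + threadIdx.y;
  for (int j = 0; j < TILEDIM; j += TILEROWS) {
    if (x < nx && (y + j) < ny)
      tile[threadIdx.y + j][threadIdx.x] = in[x + (y + j)*nx];
  }
  __syncthreads();

  // Swap block coordinates so the output writes are also contiguous in memory.
  x = blockIdx.y*TILEDIM + threadIdx.x;
  y = blockIdx.x*TILEDIM + threadIdx.y;
  for (int j = 0; j < TILEDIM; j += TILEROWS) {
    if (x < ny && (y + j) < nx)
      out[x + (y + j)*ny] = tile[threadIdx.x][threadIdx.y + j];
  }
}
// -------------------------------------------------------------------------------------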
2156 // Batched transpose
2158 void batchTranspose_xyz_yzx(
2159 const int numBatches, TransposeBatch<float2>* batches,
2160 const int max_nx, const int ny, const int nz,
2161 const int xsize_in, const int ysize_in, cudaStream_t stream) {
2163 dim3 numthread(TILEDIM, TILEROWS, 1);
2164 dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
2166 batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
2167 (batches, ny, nz, xsize_in, ysize_in);
2169 cudaCheck(cudaGetLastError());
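// --- Editorial note (sketch, not part of the solver) --------------------------------
// The batched wrappers fold the batch index into the grid's z dimension
// (numblock.z = nz*numBatches). Inside batchTranspose_*_kernel each block recovers
// its batch and z-slice from blockIdx.z and pulls the per-batch pointers and sizes
// from the TransposeBatch array. Sketch of that decomposition, assuming a per-batch
// record with fields along the lines of nx, data_in and data_out (field names here
// are illustrative, not quoted from the struct definition):
//   const int batch = blockIdx.z / nz;          // which transpose in the batch
//   const int z     = blockIdx.z - batch*nz;    // z slice within that transpose
//   TransposeBatch<float2> b = batches[batch];
//   // ... then proceed exactly like the single-transpose kernel, using
//   // b.data_in / b.data_out and the per-batch sizes in place of the global arguments.
// -------------------------------------------------------------------------------------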
2175 void transpose_xyz_zxy(
2176 const int nx, const int ny, const int nz,
2177 const int xsize_in, const int ysize_in,
2178 const int zsize_out, const int xsize_out,
2179 const float2* data_in, float2* data_out, cudaStream_t stream) {
2181 dim3 numthread(TILEDIM, TILEROWS, 1);
2182 dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
2184 transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
2185 (nx, ny, nz, xsize_in, ysize_in,
2186 zsize_out, xsize_out,
2189 cudaCheck(cudaGetLastError());
2193 // Batched transpose
2195 void batchTranspose_xyz_zxy(
2196 const int numBatches, TransposeBatch<float2>* batches,
2197 const int max_nx, const int ny, const int nz,
2198 const int xsize_in, const int ysize_in,
2199 cudaStream_t stream) {
2201 dim3 numthread(TILEDIM, TILEROWS, 1);
2202 dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
2204 batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
2205 (batches, ny, nz, xsize_in, ysize_in);
2207 cudaCheck(cudaGetLastError());
2210 template <int block_size, bool alchDecouple>
2211 __global__ void calcSelfEnergyFEPKernel(double* d_selfEnergy, double* d_selfEnergy_FEP, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down) {
2212 const int i = blockIdx.x * blockDim.x + threadIdx.x;
2216 double scaleLambda1 = 0;
2217 double scaleLambda2 = 0;
2218     // NOTE: len is NOT the full length of the d_atoms array; it is only the length of the first grid.
2219     // d_atoms always holds at least two grids by the time control reaches here, because an alchemical transformation is in progress.
2221 switch (d_partition[i]) {
2223 scaleLambda1 = 1.0f;
2224 scaleLambda2 = 1.0f;
2225 q = d_atoms[i].w; // or d_atoms_grid2[i].w
2229 scaleLambda1 = lambda1Up; // lambda1 up
2230 scaleLambda2 = lambda2Up; // lambda2 up
2235 scaleLambda1 = lambda1Down; // lambda1 down
2236 scaleLambda2 = lambda2Down; // lambda2 down
2237         // The charges in the first grid are already scaled to 0,
2238         // so the charges are fetched from the second grid instead.
2239 q = d_atoms[i + len].w;
2247 qq1 = q * q * scaleLambda1;
2248 qq2 = q * q * scaleLambda2;
2251 typedef cub::BlockReduce<double, block_size> BlockReduce;
2252 __shared__ typename BlockReduce::TempStorage temp_storage;
2253   qq1 = BlockReduce(temp_storage).Sum(qq1);
  __syncthreads(); // temp_storage is reused below; synchronize before the second block reduction
2256   qq2 = BlockReduce(temp_storage).Sum(qq2);
2258 if (threadIdx.x == 0) {
2259 const double c = -ewaldcof * (1.0 / SQRT_PI);
2260 atomicAdd(d_selfEnergy, c*qq1);
2261 atomicAdd(d_selfEnergy_FEP, c*qq2);
2266 void calcSelfEnergyFEPWrapper(double* d_selfEnergy, double* d_selfEnergy_FEP, double& h_selfEnergy, double& h_selfEnergyFEP, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream) {
2267 const int block_size = 128;
2268 const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2270 calcSelfEnergyFEPKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
2272 calcSelfEnergyFEPKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
2274 copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
2275 copy_DtoH<double>(d_selfEnergy_FEP, &h_selfEnergyFEP, 1, stream);
2276 cudaCheck(cudaStreamSynchronize(stream));
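// --- Illustrative sketch (not part of the solver) -----------------------------------
// Both self-energy kernels above use the same reduction pattern: each thread computes
// a local double, cub::BlockReduce collapses it to one value per block (valid only in
// thread 0), and thread 0 atomically accumulates that value into a device-side double.
// Stripped-down version of the idiom with hypothetical names; d_sum is assumed to be
// zeroed before the launch.
template <int block_size>
__global__ void demoBlockReduceSumKernel(const float4* d_atoms, const int numAtoms, double* d_sum) {
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  double q2 = 0.0;
  if (i < numAtoms) {
    const double q = d_atoms[i].w;
    q2 = q*q;                                 // per-thread contribution (sum of squared charges)
  }
  typedef cub::BlockReduce<double, block_size> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  q2 = BlockReduce(temp_storage).Sum(q2);     // block-wide sum, valid in thread 0 only
  if (threadIdx.x == 0) atomicAdd(d_sum, q2); // one atomic per block into the accumulator
}

void demoBlockReduceSum(const float4* d_atoms, const int numAtoms, double* d_sum, cudaStream_t stream) {
  const int block_size = 128;
  const int num_blocks = (numAtoms + block_size - 1)/block_size;
  demoBlockReduceSumKernel<block_size><<<num_blocks, block_size, 0, stream>>>(d_atoms, numAtoms, d_sum);
  cudaCheck(cudaGetLastError());
}
// -------------------------------------------------------------------------------------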
2279 template <int block_size, bool alchDecouple>
2280 __global__ void calcSelfEnergyTIKernel(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda1Down) {
2281 const int i = blockIdx.x * blockDim.x + threadIdx.x;
2286 double factor_ti_1 = 0.0;
2287 double factor_ti_2 = 0.0;
2288 double elecLambda1 = 0.0;
2290 switch (d_partition[i]) {
2295 q = d_atoms[i].w; // or d_atoms_grid2[i].w
2299 elecLambda1 = lambda1Up;
2306 elecLambda1 = lambda1Down;
2309 q = d_atoms[i + len].w;
2316 qq = q * q * elecLambda1;
2317 qq1 = q * q * factor_ti_1;
2318 qq2 = q * q * factor_ti_2;
2321 typedef cub::BlockReduce<double, block_size> BlockReduce;
2322 __shared__ typename BlockReduce::TempStorage temp_storage;
2323   qq = BlockReduce(temp_storage).Sum(qq);
  __syncthreads(); // synchronize before reusing temp_storage
2325   qq1 = BlockReduce(temp_storage).Sum(qq1);
  __syncthreads(); // synchronize before reusing temp_storage
2327   qq2 = BlockReduce(temp_storage).Sum(qq2);
2329 if (threadIdx.x == 0) {
2330 const double c = -ewaldcof * (1.0 / SQRT_PI);
2331 atomicAdd(d_selfEnergy, c*qq);
2332 atomicAdd(d_selfEnergy_TI_1, c*qq1);
2333 atomicAdd(d_selfEnergy_TI_2, c*qq2);
2338 void calcSelfEnergyTIWrapper(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, double& h_selfEnergy, double& h_selfEnergy_TI_1, double& h_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream) {
2339 const int block_size = 128;
2340 const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2342 calcSelfEnergyTIKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
2344 calcSelfEnergyTIKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
2346 copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
2347 copy_DtoH<double>(d_selfEnergy_TI_1, &h_selfEnergy_TI_1, 1, stream);
2348 copy_DtoH<double>(d_selfEnergy_TI_2, &h_selfEnergy_TI_2, 1, stream);
2349 cudaCheck(cudaStreamSynchronize(stream));
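// --- Illustrative usage sketch (not part of the solver) -----------------------------
// How a caller might drive calcSelfEnergyTIWrapper: the three device accumulators must
// be allocated and zeroed before the call, since the kernel only atomicAdd's into them;
// the wrapper then copies the results back and synchronizes the stream. Buffer names
// and parameter values here are hypothetical.
//   double *d_E, *d_dEdl_1, *d_dEdl_2;
//   cudaCheck(cudaMalloc(&d_E, sizeof(double)));
//   cudaCheck(cudaMalloc(&d_dEdl_1, sizeof(double)));
//   cudaCheck(cudaMalloc(&d_dEdl_2, sizeof(double)));
//   cudaCheck(cudaMemsetAsync(d_E, 0, sizeof(double), stream));
//   cudaCheck(cudaMemsetAsync(d_dEdl_1, 0, sizeof(double), stream));
//   cudaCheck(cudaMemsetAsync(d_dEdl_2, 0, sizeof(double), stream));
//   double h_E, h_dEdl_1, h_dEdl_2;
//   calcSelfEnergyTIWrapper(d_E, d_dEdl_1, d_dEdl_2, h_E, h_dEdl_1, h_dEdl_2,
//                           d_atoms, d_partition, numAtoms, ewaldcof,
//                           alchDecouple, lambda1Up, lambda1Down, stream);
// -------------------------------------------------------------------------------------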
2352 template <int NUM_ARRAYS>
2353 __global__ void scaleAndMergeForceKernel(float3* forces, const float* factors, const int num_atoms) {
2354 const int i = blockIdx.x * blockDim.x + threadIdx.x;
2355 if (i >= num_atoms) return;
2356 switch (NUM_ARRAYS) {
2358 forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1];
2359 forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1];
2360 forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1];
2364 forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2];
2365 forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2];
2366 forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2];
2370 forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3];
2371 forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3];
2372 forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3];
2376 forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3] + forces[i + 4 * num_atoms].x * factors[4];
2377 forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3] + forces[i + 4 * num_atoms].y * factors[4];
2378 forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3] + forces[i + 4 * num_atoms].z * factors[4];
2384 void scaleAndMergeForceWrapper(float3* forces, const float* factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream) {
2385 const int block_size = 128;
2386 const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2387 switch (num_arrays) {
2388 case 2: scaleAndMergeForceKernel<2><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2389 case 3: scaleAndMergeForceKernel<3><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2390 case 4: scaleAndMergeForceKernel<4><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2391 case 5: scaleAndMergeForceKernel<5><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2393 cudaCheck(cudaGetLastError());
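// --- Illustrative usage sketch (not part of the solver) -----------------------------
// scaleAndMergeForceKernel expects the per-grid force arrays packed back to back in one
// allocation of num_arrays*num_atoms float3 entries: slab g starts at forces + g*num_atoms
// and is scaled by factors[g], and the weighted sum is written back into slab 0. The
// factors array is read on the device. Hypothetical call merging two alchemical grids
// with weights (1-lambda, lambda):
//   const float h_factors[2] = { 1.0f - lambda, lambda };
//   float* d_factors;
//   cudaCheck(cudaMalloc(&d_factors, 2*sizeof(float)));
//   cudaCheck(cudaMemcpyAsync(d_factors, h_factors, 2*sizeof(float),
//                             cudaMemcpyHostToDevice, stream));
//   // d_forces holds 2*numAtoms float3 entries: grid 0 forces, then grid 1 forces.
//   scaleAndMergeForceWrapper(d_forces, d_factors, 2, numAtoms, stream);
// -------------------------------------------------------------------------------------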