NAMD
CudaPmeSolverUtilKernel.cu
Go to the documentation of this file.
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #endif
4 
5 #include <math.h>
6 #include <stdio.h>
7 #ifdef NAMD_CUDA
8 #include <cuda.h>
9 #if __CUDACC_VER_MAJOR__ >= 11
10 #include <cub/cub.cuh>
11 #else
12 #include <namd_cub/cub.cuh>
13 #endif
14 #endif
15 
16 #ifdef NAMD_HIP
17 #include <hip/hip_runtime.h>
18 #include <hip/hip_runtime_api.h>
19 #include <hipcub/hipcub.hpp>
20 #define cub hipcub
21 #endif
22 
23 #include "HipDefines.h"
24 #include "CudaUtils.h"
25 #include "CudaRecord.h"
26 #include "CudaPmeSolverUtilKernel.h"
27 
28 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
29 // CCELEC is 1/ (4 pi eps ) in AKMA units, conversion from SI
30 // units: CCELEC = e*e*Na / (4*pi*eps*1Kcal*1A)
31 //
32 // parameter :: CCELEC=332.0636D0 ! old value of dubious origin
33 // parameter :: CCELEC=331.843D0 ! value from 1986-1987 CRC Handbook
34 // ! of Chemistry and Physics
35 // real(chm_real), parameter :: &
36 // CCELEC_amber = 332.0522173D0, &
37 // CCELEC_charmm = 332.0716D0 , &
38 // CCELEC_discover = 332.054D0 , &
39 // CCELEC_namd = 332.0636D0
40 //const double ccelec = 332.0636;
41 //const double half_ccelec = 0.5*ccelec;
42 //const float ccelec_float = 332.0636f;
43 
44 /*
45 // Structure into which virials are stored
46 // NOTE: This structure is only used for computing addresses
47 struct Virial_t {
48  double sforce_dp[27][3];
49  long long int sforce_fp[27][3];
50  double virmat[9];
51  // Energies start here ...
52 };
53 */
54 
55 // Local structure for scalar_sum -function for energy and virial reductions
56 struct RecipVirial_t {
57  double energy;
58  double virial[6];
59 };
60 
61 
62 __forceinline__ __device__ double expfunc(const double x) {
63  return exp(x);
64 }
65 
66 __forceinline__ __device__ float expfunc(const float x) {
67  return __expf(x);
68 }
69 
70 //
71 // Performs scalar sum on data(nfft1, nfft2, nfft3)
72 // T = float or double
73 // T2 = float2 or double2
74 //
75 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
76 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
77  const int size1, const int size2, const int size3,
78  const T recip1x, const T recip1y, const T recip1z,
79  const T recip2x, const T recip2y, const T recip2z,
80  const T recip3x, const T recip3y, const T recip3z,
81  const T* prefac1, const T* prefac2, const T* prefac3,
82  const T pi_ewald, const T piv_inv,
83  const int k2_00, const int k3_00,
84  T2* data,
85  double* __restrict__ energy_recip,
86  double* __restrict__ virial) {
87  // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
88  #ifdef NAMD_CUDA
89  extern __shared__ T sh_prefac[];
90  #else //NAMD_HIP
91  HIP_DYNAMIC_SHARED( T, sh_prefac)
92  #endif
93 
94  // Create pointers to shared memory
95  T* sh_prefac1 = (T *)&sh_prefac[0];
96  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
97  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
98 
99  // Calculate start position (k1, k2, k3) for each thread
100  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
101  int k3 = tid/(size1*size2);
102  tid -= k3*size1*size2;
103  int k2 = tid/size1;
104  int k1 = tid - k2*size1;
105 
106  // Starting position in data
107  int pos = k1 + (k2 + k3*size2)*size1;
108 
109  // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
110  k2 += k2_00;
111  k3 += k3_00;
112 
113  // Calculate limits w.r.t. global coordinates
114  const int lim2 = size2 + k2_00;
115  const int lim3 = size3 + k3_00;
116 
117  // Calculate increments (k1_inc, k2_inc, k3_inc)
118  int tot_inc = blockDim.x*gridDim.x;
119  const int k3_inc = tot_inc/(size1*size2);
120  tot_inc -= k3_inc*size1*size2;
121  const int k2_inc = tot_inc/size1;
122  const int k1_inc = tot_inc - k2_inc*size1;
123 
124  // Set data[0] = 0 for the global (0,0,0)
125  if (k1 == 0 && k2 == 0 && k3 == 0) {
126  T2 zero;
127  zero.x = (T)0;
128  zero.y = (T)0;
129  data[0] = zero;
130  // Increment position
131  k1 += k1_inc;
132  pos += k1_inc;
133  if (k1 >= size1) {
134  k1 -= size1;
135  k2++;
136  }
137  k2 += k2_inc;
138  pos += k2_inc*size1;
139  if (k2 >= lim2) {
140  k2 -= size2;
141  k3++;
142  }
143  k3 += k3_inc;
144  pos += k3_inc*size1*size2;
145  }
146 
147  // Load prefac data into shared memory
148  {
149  int t = threadIdx.x;
150  while (t < nfft1) {
151  sh_prefac1[t] = prefac1[t];
152  t += blockDim.x;
153  }
154  t = threadIdx.x;
155  while (t < nfft2) {
156  sh_prefac2[t] = prefac2[t];
157  t += blockDim.x;
158  }
159  t = threadIdx.x;
160  while (t < nfft3) {
161  sh_prefac3[t] = prefac3[t];
162  t += blockDim.x;
163  }
164  }
165  BLOCK_SYNC;
166 
167  double energy = 0.0;
168  double virial0 = 0.0;
169  double virial1 = 0.0;
170  double virial2 = 0.0;
171  double virial3 = 0.0;
172  double virial4 = 0.0;
173  double virial5 = 0.0;
174 
175  // If nfft1 is odd, set nfft1_half to impossible value so that
176  // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
177  // is never satisfied
178  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
179  const int nfft2_half = nfft2/2;
180  const int nfft3_half = nfft3/2;
181 
182  while (k3 < lim3) {
183 
184  T2 q = data[pos];
185 
186  int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
187  int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
188 
189  T m1, m2, m3;
190  if (doOrtho) {
191  m1 = recip1x*k1;
192  m2 = recip2y*k2s;
193  m3 = recip3z*k3s;
194  } else {
195  m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
196  m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
197  m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
198  }
199 
200  T msq = m1*m1 + m2*m2 + m3*m3;
201  T msq_inv = ((T)1)/msq;
202 
203  T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
204  T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
205  if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
206  T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
207  T C = xp3*msq_inv;
208  T theta = theta3*C;
209 
210  if (calc_energy_virial) {
211  T fac = q2*C;
212  T vir = ((T)2)*(pi_ewald + msq_inv);
213  energy += fac;
214  virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
215  virial1 += (double)(fac*vir*m1*m2);
216  virial2 += (double)(fac*vir*m1*m3);
217  virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
218  virial4 += (double)(fac*vir*m2*m3);
219  virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
220  }
221 
222  q.x *= theta;
223  q.y *= theta;
224  data[pos] = q;
225 
226  // Increment position
227  k1 += k1_inc;
228  pos += k1_inc;
229  if (k1 >= size1) {
230  k1 -= size1;
231  k2++;
232  }
233  k2 += k2_inc;
234  pos += k2_inc*size1;
235  if (k2 >= lim2) {
236  k2 -= size2;
237  k3++;
238  }
239  k3 += k3_inc;
240  pos += k3_inc*size2*size1;
241  }
242 
243  // Reduce energy and virial
244  if (calc_energy_virial) {
245  const int tid = threadIdx.x & (warpSize-1);
246  const int base = (threadIdx.x/warpSize);
247  volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
248  // Reduce within warps
249  for (int d=warpSize/2;d >= 1;d /= 2) {
250  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
251  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
252  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
253  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
254  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
255  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
256  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
257  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
258  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
259  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
260  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
261  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
262  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
263  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
264  }
265  // Reduce between warps
266  // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
267  BLOCK_SYNC;
268  if (tid == 0) {
269  sh_ev[base].energy = energy;
270  sh_ev[base].virial[0] = virial0;
271  sh_ev[base].virial[1] = virial1;
272  sh_ev[base].virial[2] = virial2;
273  sh_ev[base].virial[3] = virial3;
274  sh_ev[base].virial[4] = virial4;
275  sh_ev[base].virial[5] = virial5;
276  }
277  BLOCK_SYNC;
278  if (base == 0) {
279  energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
280  virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
281  virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
282  virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
283  virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
284  virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
285  virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
286  for (int d=warpSize/2;d >= 1;d /= 2) {
287  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
288  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
289  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
290  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
291  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
292  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
293  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
294  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
295  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
296  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
297  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
298  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
299  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
300  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
301  }
302  }
303 
304  if (threadIdx.x == 0) {
305  atomicAdd(energy_recip, energy*0.5);
306  virial0 *= -0.5;
307  virial1 *= -0.5;
308  virial2 *= -0.5;
309  virial3 *= -0.5;
310  virial4 *= -0.5;
311  virial5 *= -0.5;
312  atomicAdd(&virial[0], virial0);
313  atomicAdd(&virial[1], virial1);
314  atomicAdd(&virial[2], virial2);
315  atomicAdd(&virial[3], virial3);
316  atomicAdd(&virial[4], virial4);
317  atomicAdd(&virial[5], virial5);
318  }
319 
320  }
321 
322 }
323 
324 template <typename T>
325 __forceinline__ __device__ void write_grid(const float val, const int ind,
326  T* data) {
327  atomicAdd(&data[ind], (T)val);
328 }
329 
330 //
331 // General version for any order
332 //
333 template <typename T, int order>
334 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
335 
336  theta[order-1] = ((T)0);
337  theta[1] = w;
338  theta[0] = ((T)1) - w;
339 
340 #pragma unroll
341  for (int k=3;k <= order-1;k++) {
342  T div = ((T)1) / (T)(k-1);
343  theta[k-1] = div*w*theta[k-2];
344 #pragma unroll
345  for (int j=1;j <= k-2;j++) {
346  theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
347  }
348  theta[0] = div*(((T)1) - w)*theta[0];
349  }
350 
351  //--- one more recursion
352  T div = ((T)1) / (T)(order-1);
353  theta[order-1] = div*w*theta[order-2];
354 #pragma unroll
355  for (int j=1;j <= order-2;j++) {
356  theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
357  }
358 
359  theta[0] = div*(((T)1) - w)*theta[0];
360 }
361 
362 //
363 // Calculate theta and dtheta for general order bspline
364 //
365 template <typename T, typename T3, int order>
366 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
367 
368  theta[order-1].x = ((T)0);
369  theta[order-1].y = ((T)0);
370  theta[order-1].z = ((T)0);
371  theta[1].x = wx;
372  theta[1].y = wy;
373  theta[1].z = wz;
374  theta[0].x = ((T)1) - wx;
375  theta[0].y = ((T)1) - wy;
376  theta[0].z = ((T)1) - wz;
377 
378 #pragma unroll
379  for (int k=3;k <= order-1;k++) {
380  T div = ((T)1) / (T)(k-1);
381  theta[k-1].x = div*wx*theta[k-2].x;
382  theta[k-1].y = div*wy*theta[k-2].y;
383  theta[k-1].z = div*wz*theta[k-2].z;
384 #pragma unroll
385  for (int j=1;j <= k-2;j++) {
386  theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
387  theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
388  theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
389  }
390  theta[0].x = div*(((T)1) - wx)*theta[0].x;
391  theta[0].y = div*(((T)1) - wy)*theta[0].y;
392  theta[0].z = div*(((T)1) - wz)*theta[0].z;
393  }
394 
395  //--- perform standard b-spline differentiation
396  dtheta[0].x = -theta[0].x;
397  dtheta[0].y = -theta[0].y;
398  dtheta[0].z = -theta[0].z;
399 #pragma unroll
400  for (int j=2;j <= order;j++) {
401  dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
402  dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
403  dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
404  }
405 
406  //--- one more recursion
407  T div = ((T)1) / (T)(order-1);
408  theta[order-1].x = div*wx*theta[order-2].x;
409  theta[order-1].y = div*wy*theta[order-2].y;
410  theta[order-1].z = div*wz*theta[order-2].z;
411 #pragma unroll
412  for (int j=1;j <= order-2;j++) {
413  theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
414  theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
415  theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
416  }
417 
418  theta[0].x = div*(((T)1) - wx)*theta[0].x;
419  theta[0].y = div*(((T)1) - wy)*theta[0].y;
420  theta[0].z = div*(((T)1) - wz)*theta[0].z;
421 }
422 
423 //
424 // Spreads the charge on the grid. Calculates theta and dtheta on the fly
425 // blockDim.x = Number of atoms each block loads
426 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
427 //
428 template <typename AT, int order, bool periodicY, bool periodicZ>
429 __global__ void
430 spread_charge_kernel(const float4 *xyzq, const int ncoord,
431  const int nfftx, const int nffty, const int nfftz,
432  const int xsize, const int ysize, const int zsize,
433  const int xdim, const int y00, const int z00,
434  AT* data) {
435 
436  // Shared memory use:
437  // order = 4: 1920 bytes
438  // order = 6: 2688 bytes
439  // order = 8: 3456 bytes
440  __shared__ int sh_ix[32];
441  __shared__ int sh_iy[32];
442  __shared__ int sh_iz[32];
443  __shared__ float sh_thetax[order*32];
444  __shared__ float sh_thetay[order*32];
445  __shared__ float sh_thetaz[order*32];
446 
447  // Process atoms pos to pos_end-1 (blockDim.x)
448  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
449  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
450 
451  if (pos < pos_end && threadIdx.y == 0) {
452 
453  float4 xyzqi = xyzq[pos];
454  float x = xyzqi.x;
455  float y = xyzqi.y;
456  float z = xyzqi.z;
457  float q = xyzqi.w;
458 
459  float frx = ((float)nfftx)*x;
460  float fry = ((float)nffty)*y;
461  float frz = ((float)nfftz)*z;
462 
463  int frxi = (int)frx;
464  int fryi = (int)fry;
465  int frzi = (int)frz;
466 
467  float wx = frx - (float)frxi;
468  float wy = fry - (float)fryi;
469  float wz = frz - (float)frzi;
470 
471  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
472  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
473 
474  sh_ix[threadIdx.x] = frxi;
475  sh_iy[threadIdx.x] = fryi - y00;
476  sh_iz[threadIdx.x] = frzi - z00;
477 
478  float theta[order];
479 
480  calc_one_theta<float, order>(wx, theta);
481 #pragma unroll
482  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
483 
484  calc_one_theta<float, order>(wy, theta);
485 #pragma unroll
486  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
487 
488  calc_one_theta<float, order>(wz, theta);
489 #pragma unroll
490  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
491 
492  }
493 
494  BLOCK_SYNC;
495 
496  // Grid point location, values of (ix0, iy0, iz0) are in range 0..order-1
497  // NOTE: Only tid=0...order*order*order-1 do any computation
498  const int order3 = ((order*order*order-1)/32 + 1)*32;
499  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
500  const int x0 = tid % order;
501  const int y0 = (tid / order) % order;
502  const int z0 = tid / (order*order);
503 
504  // Loop over atoms pos..pos_end-1
505  int iadd = blockDim.x*blockDim.y/order3;
506  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
507  int iend = pos_end - blockIdx.x*blockDim.x;
508  for (;i < iend;i += iadd) {
509  int x = sh_ix[i] + x0;
510  int y = sh_iy[i] + y0;
511  int z = sh_iz[i] + z0;
512 
513  if (x >= nfftx) x -= nfftx;
514 
515  if (periodicY && (y >= nffty)) y -= nffty;
516  if (!periodicY && (y < 0 || y >= ysize)) continue;
517 
518  if (periodicZ && (z >= nfftz)) z -= nfftz;
519  if (!periodicZ && (z < 0 || z >= zsize)) continue;
520 
521  // Get position on the grid
522  int ind = x + xdim*(y + ysize*(z));
523 
524  // Here we unroll the 6x6x6 loop with 216 threads.
525  // NOTE: We use 7*32=224 threads to do this
526  // Calculate interpolated charge value and store it to global memory
527 
528  if (tid < order*order*order)
529  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
530  }
531 
532 }
533 
534 
535 template <typename AT, int order, bool periodicY, bool periodicZ>
536 __global__ void
537 spread_charge_kernel_v2(const float4 *xyzq, const int ncoord,
538  const int nfftx, const int nffty, const int nfftz,
539  const float nfftx_f, const float nffty_f,
540  const float nfftz_f,
541  const int xsize, const int ysize, const int zsize,
542  const int xdim, const int y00, const int z00,
543  AT* data) {
544 
545  // Shared memory use:
546  // order = 4: 1920 bytes
547  // order = 6: 2688 bytes
548  // order = 8: 3456 bytes
549  __shared__ int sh_ix[WARPSIZE];
550  __shared__ int sh_iy[WARPSIZE];
551  __shared__ int sh_iz[WARPSIZE];
552  __shared__ float sh_thetax[order*WARPSIZE];
553  __shared__ float sh_thetay[order*WARPSIZE];
554  __shared__ float sh_thetaz[order*WARPSIZE];
555 
556  // Process atoms pos to pos_end-1 (blockDim.x)
557  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
558  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
559  // const float invorder = 1.f/order;
560 
561  if (pos < pos_end && threadIdx.y == 0) {
562 
563  float4 xyzqi = xyzq[pos];
564  float x = xyzqi.x;
565  float y = xyzqi.y;
566  float z = xyzqi.z;
567  float q = xyzqi.w;
568 
569  float frx = (nfftx_f)*x;
570  float fry = (nffty_f)*y;
571  float frz = (nfftz_f)*z;
572 
573  int frxi = (int)frx;
574  int fryi = (int)fry;
575  int frzi = (int)frz;
576 
577  float wx = frx - (float)frxi;
578  float wy = fry - (float)fryi;
579  float wz = frz - (float)frzi;
580 
581  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
582  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
583 
584  sh_ix[threadIdx.x] = frxi;
585  sh_iy[threadIdx.x] = fryi - y00;
586  sh_iz[threadIdx.x] = frzi - z00;
587 
588  float theta[order];
589 
590  calc_one_theta<float, order>(wx, theta);
591 #pragma unroll
592  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
593 
594  calc_one_theta<float, order>(wy, theta);
595 #pragma unroll
596  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
597 
598  calc_one_theta<float, order>(wz, theta);
599 #pragma unroll
600  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
601 
602  }
603 
604  BLOCK_SYNC;
605 
606  // Grid point location, values of (ix0, iy0, iz0) are in range 0..order-1
607  // NOTE: Only tid=0...order*order*order-1 do any computation
608  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
609  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
610  const int x0 = tid % order;
611  const int y0 = (tid / order) % order;
612  //const int y0 = (int)(tid * invorder) % order;
613  const int z0 = tid / (order*order);
614  //const int z0 = tid * invorder * invorder;
615 
616  // Loop over atoms pos..pos_end-1
617  int iadd = blockDim.x*blockDim.y/order3;
618  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
619  int iend = pos_end - blockIdx.x*blockDim.x;
620  for (;i < iend;i += iadd) {
621  int x = sh_ix[i] + x0;
622  int y = sh_iy[i] + y0;
623  int z = sh_iz[i] + z0;
624 
625  if (x >= nfftx) x -= nfftx;
626 
627  if (periodicY && (y >= nffty)) y -= nffty;
628  if (!periodicY && (y < 0 || y >= ysize)) continue;
629 
630  if (periodicZ && (z >= nfftz)) z -= nfftz;
631  if (!periodicZ && (z < 0 || z >= zsize)) continue;
632 
633  // Get position on the grid
634  int ind = x + xdim*(y + ysize*(z));
635 
636  // Here we unroll the 6x6x6 loop with 216 threads.
637  // NOTE: We use 7*32=224 threads to do this
638  // Calculate interpolated charge value and store it to global memory
639 
640  if (tid < order*order*order)
641  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
642  }
643 
644 }
645 
646 
647 template<typename AT, int kPatchGridDim>
648 __global__ void
649 __launch_bounds__(PatchLevelPmeData::kNumThreads, 4)
650 spread_charge_patch_8th(
651  const int numPatches,
652  const int3 patchGridDim,
653  const CudaLocalRecord* localRecords,
654  const int3* patchGridOffsets,
655  const float4 *xyzq,
656  const int nfftx, const int nffty, const int nfftz,
657  const float nfftx_f, const float nffty_f,
658  const float nfftz_f,
659  const int xsize, const int ysize, const int zsize,
660  const int xdim, const int y00, const int z00,
661  AT* data
662 ) {
663  // Define local copies of constexpr variables
664  constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
665  constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
666  constexpr int kDims = PatchLevelPmeData::kDim;
667  constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDimPad;
668  constexpr int kOrder = 8;
669 
670  // Setup shared memory buffers
671  extern __shared__ float sharedBuffer[];
672  char4* s_indices = (char4*) sharedBuffer;
673  float* s_theta = (float*) &sharedBuffer[kNumThreads];
674  float* s_grid = (float*) &sharedBuffer[kNumThreads + kOrder * kDims * kThetaStride];
675 
676  // Zero out grid shared memory buffer since we will be accumulating into it
677  const int numGridPoints = kPatchGridDimPad * kPatchGridDim * kPatchGridDim;
678  for (int i = threadIdx.x; i < numGridPoints; i += blockDim.x) {
679  s_grid[i] = 0.0f;
680  }
681  __syncthreads();
682 
683  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
684  const int numAtoms = localRecords[patchIndex].numAtoms;
685  const int offset = localRecords[patchIndex].bufferOffset;
686  const int3 patchGridOffset = patchGridOffsets[patchIndex];
687 
688  const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
689  // Iterate over all the atoms in the patch in chunks as we need to stash theta values in
690  // shared memory
691  for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
692  //
693  // Compute thetas with each thread processing a different atom
694  //
695  NAMD_WARP_SYNC(WARP_FULL_MASK);
696  const int atomIndex = iter * blockDim.x + threadIdx.x;
697  if (atomIndex < numAtoms) {
698  float4 xyzqi = xyzq[atomIndex + offset];
699 
700  // This will just shift the unscaled positions to be between 0 and 1 instead of
701  // -0.5 and 0.5 without handling the boundary conditions. The boundary conditions
702  // will be handled when the grid is written from shared to global memory
703  float x = xyzqi.x + 0.5f;
704  float y = xyzqi.y + 0.5f;
705  float z = xyzqi.z + 0.5f;
706  float q = xyzqi.w;
707 
708  float frx = (nfftx_f)*x;
709  float fry = (nffty_f)*y;
710  float frz = (nfftz_f)*z;
711 
712  int frxi = (int)floorf(frx);
713  int fryi = (int)floorf(fry);
714  int frzi = (int)floorf(frz);
715 
716  float wx = frx - (float)frxi;
717  float wy = fry - (float)fryi;
718  float wz = frz - (float)frzi;
719 
720  // Because these are indexes into the shared memory buffer, it should be safe to store
721  // them as chars
722  s_indices[threadIdx.x].x = (char) (frxi - patchGridOffset.x);
723  s_indices[threadIdx.x].y = (char) (fryi - patchGridOffset.y);
724  s_indices[threadIdx.x].z = (char) (frzi - patchGridOffset.z);
725 
726  float theta[kOrder];
727 
728  calc_one_theta<float, kOrder>(wx, theta);
729 #pragma unroll
730  for (int i=0;i < kOrder;i++) {
731  // The x theta matrix has the atom as the fastest moving index and the point as the slowest moving index:
732  //
733  // atom 0 point 0, atom 1 point 0, atom 2 point 0, ...
734  // atom 0 point 1, atom 1 point 1, atom 2 point 1, ...
735  // ...
736  //
737  const int idx = 0 * (kOrder * kThetaStride) + i * kThetaStride + threadIdx.x;
738  s_theta[idx] = q*theta[i];
739  }
740 
741  calc_one_theta<float, kOrder>(wy, theta);
742 #pragma unroll
743  for (int i=0;i < kOrder;i++) {
744  // Each thread will process 2 grid points in the Y dimension, and we want that data to be consecutive in
745  // shared memory to allow for vector loads:
746  //
747  // atom 0 point 0, atom 0 point 4, atom 1 point 0, atom 1 point 1, ...
748  // atom 0 point 1, atom 0 point 5, atom 1 point 1, atom 1 point 5, ...
749  // ...
750  //
751  const int indexInFloat2 = i / 4;
752  const int float2Index = i % 4;
753  const int idx = 1 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
754  s_theta[idx] = theta[i];
755  }
756 
757  calc_one_theta<float, kOrder>(wz, theta);
758 #pragma unroll
759  for (int i=0;i < kOrder;i++) {
760  // The z theta values use the same layout as the y theta values since each thread will
761  // process 2 grid points in the Z dimension
762  const int indexInFloat2 = i / 4;
763  const int float2Index = i % 4;
764  const int idx = 2 * (kOrder * kThetaStride) + 2 * float2Index * kThetaStride + 2 * threadIdx.x + indexInFloat2;
765  s_theta[idx] = theta[i];
766  }
767  }
768  __syncthreads();
769 
770  //
771  // Compute the grid contributions and store in shared memory with all threads processing a single atom concurrently
772  //
773 
774  // Each thread block will compute a 8x8x8 grid of charges with 128 threads. Each thread will
775  // compute a 1x2x2 non-contiguous sub-grid based on the layout of the theta described above.
776  // For example, thread 0 will compute (0,0,0), (0,4,0), (0,0,4), (0,4,4)
777  const int xOffset = threadIdx.x % kOrder;
778  const int yPartialOffset = (threadIdx.x / kOrder) % (kOrder / 2);
779  const int zPartialOffset = threadIdx.x / (kOrder * (kOrder / 2));
780 
781  constexpr int kYItersPerThread = 2;
782  constexpr int kZItersPerThread = 2;
783  constexpr int kYValPerIter = 4;
784  constexpr int kZValPerIter = 4;
785 
786  const int atomsToProcess = min(kNumThreads, numAtoms - iter * kNumThreads);
787  const int outerIters = ((atomsToProcess) / 4) * 4;
788  int thetaIndexOuter = 0;
789 
790  // The loop over atoms is broken into chunks of 4 to enable loading in multiple atom's x theta data.
791  // Each warp will only need to load in 8 distinct x theta values, so some of the bytes are wasted. By
792  // loading in the x theta data for 4 atoms at once, we can avoid this.
793  for ( ; thetaIndexOuter < outerIters; thetaIndexOuter += 4) {
794  char4 r_global_indices[4];
795  float r_x_val[4];
796 
797 #pragma unroll
798  for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
799  const int thetaIndex = thetaIndexOuter + thetaIndexInner;
800  // The compiler should auto vectorize these loads
801  r_global_indices[thetaIndexInner] = s_indices[thetaIndex];
802  r_x_val[thetaIndexInner] = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
803  }
804 
805 #pragma unroll
806  for (int thetaIndexInner = 0; thetaIndexInner < 4; thetaIndexInner++) {
807  const int thetaIndex = thetaIndexOuter + thetaIndexInner;
808 
809  const int global_idx_x = r_global_indices[thetaIndexInner].x;
810  const int global_idx_y = r_global_indices[thetaIndexInner].y;
811  const int global_idx_z = r_global_indices[thetaIndexInner].z;
812 
813  const int shared_idx_x = global_idx_x + xOffset;
814  const float x_val = r_x_val[thetaIndexInner];
815 
816 
817  int r_shared_idx_y[kYItersPerThread];
818  float r_grid_val_xy[kYItersPerThread];
819 #pragma unroll
820  for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
821  const int yOffset = 2 * yPartialOffset;
822  r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
823  // The compiler should auto vectorize these loads
824  const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
825  r_grid_val_xy[yIter] = x_val * y_val;
826  }
827 
828  int r_shared_idx_z[kZItersPerThread];
829  float r_z_val[kZItersPerThread];
830 #pragma unroll
831  for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
832  const int zOffset = 2 * zPartialOffset;
833  r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
834  r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
835  }
836 
837 #pragma unroll
838  for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
839 #pragma unroll
840  for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
841  const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
842  const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
843  s_grid[shared_idx] += grid_val;
844  }
845  }
846  __syncthreads();
847  }
848  }
849 
850  // Compute for the remaining atoms
851  for (int thetaIndex = thetaIndexOuter ; thetaIndex < atomsToProcess; thetaIndex++) {
852  const int global_idx_x = s_indices[thetaIndex].x;
853  const int global_idx_y = s_indices[thetaIndex].y;
854  const int global_idx_z = s_indices[thetaIndex].z;
855 
856  const int shared_idx_x = global_idx_x + xOffset;
857  const float x_val = s_theta[0 * (kOrder * kThetaStride) + xOffset * kThetaStride + thetaIndex];
858 
859  int r_shared_idx_y[kYItersPerThread];
860  float r_grid_val_xy[kYItersPerThread];
861 #pragma unroll
862  for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
863  const int yOffset = 2 * yPartialOffset;
864  r_shared_idx_y[yIter] = global_idx_y + yPartialOffset + yIter * kYValPerIter;
865  const float y_val = s_theta[1 * (kOrder * kThetaStride) + yOffset * kThetaStride + 2 * thetaIndex + yIter];
866  r_grid_val_xy[yIter] = x_val * y_val;
867  }
868 
869  int r_shared_idx_z[kZItersPerThread];
870  float r_z_val[kZItersPerThread];
871 #pragma unroll
872  for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
873  const int zOffset = 2 * zPartialOffset;
874  r_z_val[zIter] = s_theta[2 * (kOrder * kThetaStride) + zOffset * kThetaStride + 2 * thetaIndex + zIter];
875  r_shared_idx_z[zIter] = global_idx_z + zPartialOffset + zIter * kZValPerIter;
876  }
877 
878 #pragma unroll
879  for (int yIter = 0; yIter < kYItersPerThread; yIter++) {
880 #pragma unroll
881  for (int zIter = 0; zIter < kZItersPerThread; zIter++) {
882  const float grid_val = r_grid_val_xy[yIter] * r_z_val[zIter];
883  const int shared_idx = shared_idx_x + kPatchGridDimPad * (r_shared_idx_y[yIter] + r_shared_idx_z[zIter] * kPatchGridDim);
884  s_grid[shared_idx] += grid_val;
885  }
886  }
887  __syncthreads();
888  }
889  }
890  //
891  // Write Shared grid to global memory
892  //
893  const int indexInWarp = threadIdx.x % 32;
894  const int warpIndex = threadIdx.x / 32;
895  const int numWarps = blockDim.x / 32;
896  const bool isActive = (indexInWarp < patchGridDim.x);
897  if (!isActive) continue;
898 
899  const int shared_idx_x = indexInWarp;
900  int global_idx_x = shared_idx_x + patchGridOffset.x;
901  if (global_idx_x >= nfftx) global_idx_x -= nfftx;
902  else if (global_idx_x < 0) global_idx_x += nfftx;
903 
904 #pragma unroll 1
905  for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
906  const int shared_idx_y = yIter;
907  int global_idx_y = shared_idx_y + patchGridOffset.y;
908  if (global_idx_y >= nffty) global_idx_y -= nffty;
909  else if (global_idx_y < 0) global_idx_y += nffty;
910 
911  for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
912  const int shared_idx_z = zIter;
913  int global_idx_z = shared_idx_z + patchGridOffset.z;
914  if (global_idx_z >= nfftz) global_idx_z -= nfftz;
915  else if (global_idx_z < 0) global_idx_z += nfftz;
916 
917  const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
918  const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
919 
920  atomicAdd(data + global_idx, s_grid[shared_idx]);
921  s_grid[shared_idx] = 0.0f;
922  }
923  }
924  }
925 }
926 
927 
928 //-----------------------------------------------------------------------------------------
929 // Generic version can not be used
930 template <typename T> __forceinline__ __device__
931 void gather_force_store(const float fx, const float fy, const float fz,
932  const int stride, const int pos,
933  T* force) {
934 }
935 
936 // Template specialization for "float"
937 template <> __forceinline__ __device__
938 void gather_force_store<float>(const float fx, const float fy, const float fz,
939  const int stride, const int pos,
940  float* force) {
941  // Store into non-strided float XYZ array
942  force[pos] = fx;
943  force[pos+stride] = fy;
944  force[pos+stride*2] = fz;
945 }
946 
947 // Template specialization for "float3"
948 template <> __forceinline__ __device__
949 void gather_force_store<float3>(const float fx, const float fy, const float fz,
950  const int stride, const int pos,
951  float3* force) {
952  // Store into non-strided "float3" array
953 #ifdef NAMD_HIP
954  // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
955  // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
956  reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
957  reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
958  reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
959 #else
960  force[pos].x = fx;
961  force[pos].y = fy;
962  force[pos].z = fz;
963 #endif
964 }
965 //-----------------------------------------------------------------------------------------
966 
967 // Per atom data structure for the gather_force -kernels
968 template <typename T, int order>
969 struct gather_t {
970  int ix;
971  int iy;
972  int iz;
973  T charge;
974  T thetax[order];
975  T thetay[order];
976  T thetaz[order];
977  T dthetax[order];
978  T dthetay[order];
979  T dthetaz[order];
980  float f1;
981  float f2;
982  float f3;
983 };
984 
985 //
986 // Gathers forces from the grid
987 // blockDim.x = Number of atoms each block loads
988 // blockDim.x*blockDim.y = Total number of threads per block
989 //
990 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
991 __global__ void gather_force(const float4 *xyzq, const int ncoord,
992  const int nfftx, const int nffty, const int nfftz,
993  const int xsize, const int ysize, const int zsize,
994  const int xdim, const int y00, const int z00,
995  // const float recip1, const float recip2, const float recip3,
996  const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
997 #ifdef NAMD_CUDA
998  const cudaTextureObject_t gridTexObj,
999 #endif
1000  const int stride,
1001  FT *force) {
1002 
1003  const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
1004 
1005  // Shared memory
1006  __shared__ gather_t<CT, order> shmem[32];
1007 
1008  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
1009  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
1010 
1011  // Load atom data into shared memory
1012  if (pos < pos_end && threadIdx.y == 0) {
1013 
1014  float4 xyzqi = xyzq[pos];
1015  float x = xyzqi.x;
1016  float y = xyzqi.y;
1017  float z = xyzqi.z;
1018  float q = xyzqi.w;
1019 
1020  float frx = ((float)nfftx)*x;
1021  float fry = ((float)nffty)*y;
1022  float frz = ((float)nfftz)*z;
1023 
1024  int frxi = (int)frx;
1025  int fryi = (int)fry;
1026  int frzi = (int)frz;
1027 
1028  float wx = frx - (float)frxi;
1029  float wy = fry - (float)fryi;
1030  float wz = frz - (float)frzi;
1031 
1032  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
1033  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
1034 
1035  shmem[threadIdx.x].ix = frxi;
1036  shmem[threadIdx.x].iy = fryi - y00;
1037  shmem[threadIdx.x].iz = frzi - z00;
1038  shmem[threadIdx.x].charge = q;
1039 
1040  float3 theta_tmp[order];
1041  float3 dtheta_tmp[order];
1042  calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
1043 
1044 #pragma unroll
1045  for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
1046 
1047 #pragma unroll
1048  for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
1049 
1050 #pragma unroll
1051  for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
1052 
1053 #pragma unroll
1054  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
1055 
1056 #pragma unroll
1057  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
1058 
1059 #pragma unroll
1060  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
1061 
1062  }
1063  BLOCK_SYNC;
1064 
1065  // We divide the order x order x order cube into 8 sub-cubes.
1066  // These sub-cubes are taken care by a single thread
1067  // The size of the sub-cubes is:
1068  // order=4 : 2x2x2
1069  // order=6 : 3x3x3
1070  // order=8 : 4x4x4
1071  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
1072  // Calculate the starting index on the sub-cube for this thread
1073  // tid = 0...63
1074  const int t = (tid % 8); // sub-cube index (0...7)
1075  // t = (tx0 + ty0*2 + tz0*4)/nsc
1076  // (tx0, ty0, tz0) gives the starting index of the 3x3x3 sub-cube
1077  const int tz0 = (t / 4)*nsc;
1078  const int ty0 = ((t / 2) % 2)*nsc;
1079  const int tx0 = (t % 2)*nsc;
1080 
1081  //
1082  // Calculate forces for 32 atoms. We have 32*2 = 64 threads
1083  // Loop is iterated 4 times:
1084  // (iterations)
1085  // Threads 0...7 = atoms 0, 8, 16, 24
1086  // Threads 8...15 = atoms 1, 9, 17, 25
1087  // Threads 16...31 = atoms 2, 10, 18, 26
1088  // ...
1089  // Threads 56...63 = atoms 7, 15, 23, 31
1090  //
1091 
1092  int base = tid/8;
1093  const int base_end = pos_end - blockIdx.x*blockDim.x;
1094 
1095  // Make sure to mask out any threads that are not running the loop.
1096  // This will happen if the number of atoms is not a multiple of 32.
1097 #if defined(NAMD_HIP)
1098  WarpMask warp_mask = NAMD_WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
1099 #else
1100  WarpMask warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
1101 #endif
1102 
1103  while (base < base_end) {
1104 
1105  float f1 = 0.0f;
1106  float f2 = 0.0f;
1107  float f3 = 0.0f;
1108  int ix0 = shmem[base].ix;
1109  int iy0 = shmem[base].iy;
1110  int iz0 = shmem[base].iz;
1111 
1112  // Each thread calculates a nsc x nsc x nsc sub-cube
1113 #pragma unroll
1114  for (int i=0;i < nsc*nsc*nsc;i++) {
1115  int tz = tz0 + (i/(nsc*nsc));
1116  int ty = ty0 + ((i/nsc) % nsc);
1117  int tx = tx0 + (i % nsc);
1118 
1119  int ix = ix0 + tx;
1120  int iy = iy0 + ty;
1121  int iz = iz0 + tz;
1122  if (ix >= nfftx) ix -= nfftx;
1123 
1124  if (periodicY && (iy >= nffty)) iy -= nffty;
1125  if (!periodicY && (iy < 0 || iy >= ysize)) continue;
1126 
1127  if (periodicZ && (iz >= nfftz)) iz -= nfftz;
1128  if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
1129 
1130  int ind = ix + (iy + iz*ysize)*xdim;
1131 
1132 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
1133  float q0 = __ldg(&data[ind]);
1134 #else
1135  float q0 = tex1Dfetch<float>(gridTexObj, ind);
1136 #endif
1137  float thx0 = shmem[base].thetax[tx];
1138  float thy0 = shmem[base].thetay[ty];
1139  float thz0 = shmem[base].thetaz[tz];
1140  float dthx0 = shmem[base].dthetax[tx];
1141  float dthy0 = shmem[base].dthetay[ty];
1142  float dthz0 = shmem[base].dthetaz[tz];
1143  f1 += dthx0 * thy0 * thz0 * q0;
1144  f2 += thx0 * dthy0 * thz0 * q0;
1145  f3 += thx0 * thy0 * dthz0 * q0;
1146  }
1147 
1148  //-------------------------
1149 
1150  // Reduce
1151  const int i = threadIdx.x & 7;
1152 
1153  f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
1154  f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
1155  f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
1156 
1157  f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
1158  f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
1159  f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
1160 
1161  f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
1162  f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
1163  f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
1164 
1165  if (i == 0) {
1166  shmem[base].f1 = f1;
1167  shmem[base].f2 = f2;
1168  shmem[base].f3 = f3;
1169  }
1170 
1171  base += 8;
1172 #if defined(NAMD_HIP)
1173  warp_mask = NAMD_WARP_BALLOT(warp_mask, (base < base_end) );
1174 #else
1175  warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
1176 #endif
1177  }
1178 
1179  // Write forces
1180  BLOCK_SYNC;
1181  if (pos < pos_end && threadIdx.y == 0) {
1182  float f1 = shmem[threadIdx.x].f1;
1183  float f2 = shmem[threadIdx.x].f2;
1184  float f3 = shmem[threadIdx.x].f3;
1185  float q = -shmem[threadIdx.x].charge; //*ccelec_float;
1186  // float fx = q*recip1*f1*nfftx;
1187  // float fy = q*recip2*f2*nffty;
1188  // float fz = q*recip3*f3*nfftz;
1189  float fx = q*f1*nfftx;
1190  float fy = q*f2*nffty;
1191  float fz = q*f3*nfftz;
1192  gather_force_store<FT>(fx, fy, fz, stride, pos, force);
1193  }
1194 
1195 }
1196 
1197 template<typename CT, typename FT, int kPatchGridDim>
1198 __global__ void
1199 __launch_bounds__(PatchLevelPmeData::kNumThreads, 1)
1200 gather_force_patch_8th( const int numPatches,
1201  const int3 patchGridDim,
1202  const CudaLocalRecord* localRecords,
1203  const int3* patchGridOffsets,
1204  const float4 *xyzq,
1205  const int nfftx, const int nffty, const int nfftz,
1206  const int xsize, const int ysize, const int zsize,
1207  const int xdim, const int y00, const int z00,
1208  const float* data,
1209  const int stride,
1210  FT *force
1211 ) {
1212  // Define local copies of constexpr variables
1213  constexpr int kNumThreads = PatchLevelPmeData::kNumThreads;
1214  constexpr int kThetaStride = kNumThreads + PatchLevelPmeData::kThetaPad;
1215  constexpr int kDims = PatchLevelPmeData::kDim;
1216  constexpr int kPatchGridDimPad = PatchLevelPmeData::kPatchGridDim;
1217  constexpr int kOrder = 8;
1218 
1219  // Setup shared memory buffers
1220  extern __shared__ float sharedBuffer[];
1221  float2* s_x_theta = (float2*) sharedBuffer;
1222  float* s_grid = (float*) &sharedBuffer[2 * kOrder * kThetaStride];
1223 
1224  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1225  const int numAtoms = localRecords[patchIndex].numAtoms;
1226  const int offset = localRecords[patchIndex].bufferOffset;
1227  const int3 patchGridOffset = patchGridOffsets[patchIndex];
1228 
1229  //
1230  // Load grid into shared memory
1231  //
1232  const int indexInWarp = threadIdx.x % 32;
1233  const int warpIndex = threadIdx.x / 32;
1234  const int numWarps = blockDim.x / 32;
1235  const bool isActive = (indexInWarp < patchGridDim.x);
1236 
1237  const int shared_idx_x = indexInWarp;
1238  int global_idx_x = shared_idx_x + patchGridOffset.x;
1239  if (global_idx_x >= nfftx) global_idx_x -= nfftx;
1240  else if (global_idx_x < 0) global_idx_x += nfftx;
1241 
1242 #pragma unroll 1
1243  for (int yIter = warpIndex; yIter < patchGridDim.y; yIter += numWarps) {
1244  const int shared_idx_y = yIter;
1245  int global_idx_y = shared_idx_y + patchGridOffset.y;
1246  if (global_idx_y >= nffty) global_idx_y -= nffty;
1247  else if (global_idx_y < 0) global_idx_y += nffty;
1248 
1249  for (int zIter = 0; zIter < patchGridDim.z; zIter += 1) {
1250  const int shared_idx_z = zIter;
1251  int global_idx_z = shared_idx_z + patchGridOffset.z;
1252  if (global_idx_z >= nfftz) global_idx_z -= nfftz;
1253  else if (global_idx_z < 0) global_idx_z += nfftz;
1254 
1255  const int shared_idx = shared_idx_x + kPatchGridDimPad * (shared_idx_y + shared_idx_z * kPatchGridDim);
1256  const int global_idx = global_idx_x + xdim * (global_idx_y + ysize * global_idx_z);
1257 
1258  if (isActive) {
1259  s_grid[shared_idx] = data[global_idx];
1260  }
1261  }
1262  }
1263  __syncthreads();
1264 
1265  // Iterate over the atoms in chunks of size equal to the number of threads. For small system, we can
1266  // assign multiple thread blocks per patch in the Y dimension to increase the amount of parallelism
1267  //
1268  // Each thread will compute the forces on one atom; this simplifies the kernel since it doesn't need
1269  // to reduce any partial sums of forces.
1270  const int numIters = (numAtoms - 1 + kNumThreads) / kNumThreads;
1271  for (int iter = blockIdx.y; iter < numIters; iter += gridDim.y) {
1272  const int atomIndex = iter * blockDim.x + threadIdx.x;
1273 
1274  if (atomIndex < numAtoms) {
1275  gather_t<CT, kOrder> atomData;
1276 
1277  float4 xyzqi = xyzq[atomIndex + offset];
1278  // The boundary conditions are handled when the grid is loaded into shared memory, so we
1279  // just need to shift the solutions here
1280  float x = xyzqi.x + 0.5f;
1281  float y = xyzqi.y + 0.5f;
1282  float z = xyzqi.z + 0.5f;
1283  float q = xyzqi.w;
1284 
1285  float frx = ((float)nfftx)*x;
1286  float fry = ((float)nffty)*y;
1287  float frz = ((float)nfftz)*z;
1288 
1289  int frxi = (int)floorf(frx);
1290  int fryi = (int)floorf(fry);
1291  int frzi = (int)floorf(frz);
1292 
1293  float wx = frx - (float)frxi;
1294  float wy = fry - (float)fryi;
1295  float wz = frz - (float)frzi;
1296 
1297  atomData.ix = frxi - patchGridOffset.x;
1298  atomData.iy = fryi - patchGridOffset.y;
1299  atomData.iz = frzi - patchGridOffset.z;
1300  atomData.charge = q;
1301 
1302  float3 theta_tmp[kOrder];
1303  float3 dtheta_tmp[kOrder];
1304  calc_theta_dtheta<float, float3, kOrder>(wx, wy, wz, theta_tmp, dtheta_tmp);
1305 
1306 #pragma unroll
1307  for (int i=0;i < kOrder;i++) {
1308  // x theta values are stored in shared memory to reduce register pressure
1309  s_x_theta[i * kThetaStride + threadIdx.x].x = theta_tmp[i].x;
1310  s_x_theta[i * kThetaStride + threadIdx.x].y = dtheta_tmp[i].x;
1311  atomData.thetay[i] = theta_tmp[i].y;
1312  atomData.dthetay[i] = dtheta_tmp[i].y;
1313  atomData.thetaz[i] = theta_tmp[i].z;
1314  atomData.dthetaz[i] = dtheta_tmp[i].z;
1315  }
1316 
1317  // Compute Forces
1318  float f1 = 0.0f;
1319  float f2 = 0.0f;
1320  float f3 = 0.0f;
1321 
1322 #pragma unroll 1
1323  for (int xIter = 0; xIter < kOrder; xIter++) {
1324  const int xOffset = xIter;
1325  // To reduce register pressure, the x theta values are stored in shared memory
1326  float thx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].x;
1327  float dthx0 = s_x_theta[xOffset * kThetaStride + threadIdx.x].y;
1328 
1329 #pragma unroll
1330  for (int yIter = 0; yIter < kOrder; yIter++) {
1331 #pragma unroll
1332  for (int zIter = 0; zIter < kOrder; zIter++) {
1333  const int yOffset = yIter;
1334  const int zOffset = zIter;
1335 
1336  const int shared_idx_x = atomData.ix + xOffset;
1337  const int shared_idx_y = atomData.iy + yIter;
1338  const int shared_idx_z = atomData.iz + zIter;
1339  const int shared_idx = shared_idx_x + kPatchGridDimPad *
1340  (shared_idx_y + shared_idx_z * kPatchGridDim);
1341  // This load is essentially doing random accesses to shared memory, so
1342  // it results in a significant amount of bank conflicts
1343  const float grid_val = s_grid[shared_idx];
1344 
1345  float thy0 = atomData.thetay[yOffset];
1346  float thz0 = atomData.thetaz[zOffset];
1347  float dthy0 = atomData.dthetay[yOffset];
1348  float dthz0 = atomData.dthetaz[zOffset];
1349 
1350  f1 += dthx0 * thy0 * thz0 * grid_val;
1351  f2 += thx0 * dthy0 * thz0 * grid_val;
1352  f3 += thx0 * thy0 * dthz0 * grid_val;
1353  }
1354  }
1355  }
1356 
1357  const float fx = -1.0f * atomData.charge * f1 * nfftx;
1358  const float fy = -1.0f * atomData.charge * f2 * nffty;
1359  const float fz = -1.0f * atomData.charge * f3 * nfftz;
1360  gather_force_store<FT>(fx, fy, fz, stride, atomIndex + offset, force);
1361  }
1362  }
1363  __syncthreads();
1364  }
1365 }
1366 
1367 const int TILEDIM = 32;
1368 const int TILEROWS = 8;
1369 
1370 template <typename T>
1371 __device__ __forceinline__
1372 void transpose_xyz_yzx_device(
1373  const int x_in, const int y_in, const int z_in,
1374  const int x_out, const int y_out,
1375  const int nx, const int ny, const int nz,
1376  const int xsize_in, const int ysize_in,
1377  const int ysize_out, const int zsize_out,
1378  const T* data_in, T* data_out) {
1379 
1380  // Shared memory
1381  __shared__ T tile[TILEDIM][TILEDIM+1];
1382 
1383  // Read (x,y) data_in into tile (shared memory)
1384  for (int j=0;j < TILEDIM;j += TILEROWS)
1385  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
1386  tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
1387 
1388  BLOCK_SYNC;
1389 
1390  // Write (y,x) tile into data_out
1391  const int z_out = z_in;
1392  for (int j=0;j < TILEDIM;j += TILEROWS)
1393  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
1394  data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
1395 }
1396 
1397 //
1398 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1399 //
1400 template <typename T>
1401 __global__ void transpose_xyz_yzx_kernel(
1402  const int nx, const int ny, const int nz,
1403  const int xsize_in, const int ysize_in,
1404  const int ysize_out, const int zsize_out,
1405  const T* data_in, T* data_out) {
1406 
1407  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1408  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
1409  int z_in = blockIdx.z + threadIdx.z;
1410 
1411  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1412  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
1413 
1414  transpose_xyz_yzx_device<T>(
1415  x_in, y_in, z_in,
1416  x_out, y_out,
1417  nx, ny, nz,
1418  xsize_in, ysize_in,
1419  ysize_out, zsize_out,
1420  data_in, data_out);
1421 
1422 /*
1423  // Shared memory
1424  __shared__ T tile[TILEDIM][TILEDIM+1];
1425 
1426  int x = blockIdx.x * TILEDIM + threadIdx.x;
1427  int y = blockIdx.y * TILEDIM + threadIdx.y;
1428  int z = blockIdx.z + threadIdx.z;
1429 
1430  // Read (x,y) data_in into tile (shared memory)
1431  for (int j=0;j < TILEDIM;j += TILEROWS)
1432  if ((x < nx) && (y + j < ny) && (z < nz))
1433  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
1434 
1435  BLOCK_SYNC;
1436 
1437  // Write (y,x) tile into data_out
1438  x = blockIdx.x * TILEDIM + threadIdx.y;
1439  y = blockIdx.y * TILEDIM + threadIdx.x;
1440  for (int j=0;j < TILEDIM;j += TILEROWS)
1441  if ((x + j < nx) && (y < ny) && (z < nz))
1442  data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
1443 */
1444 }
1445 
1446 //
1447 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1448 // Batch index bi is encoded in blockIdx.z, where
1449 // blockIdx.z = 0...nz-1 are for batch 1
1450 // blockIdx.z = nz...2*nz-1 are for batch 2
1451 // ...
1452 // gridDim.z = nz*numBatches
1453 //
1454 template <typename T>
1455 __global__ void batchTranspose_xyz_yzx_kernel(
1456  const TransposeBatch<T>* batches,
1457  const int ny, const int nz,
1458  const int xsize_in, const int ysize_in) {
1459 
1460  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1461  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
1462  int z_in = (blockIdx.z % nz) + threadIdx.z;
1463 
1464  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1465  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
1466 
1467  int bi = blockIdx.z/nz;
1468 
1469  TransposeBatch<T> batch = batches[bi];
1470  int nx = batch.nx;
1471  int ysize_out = batch.ysize_out;
1472  int zsize_out = batch.zsize_out;
1473  T* data_in = batch.data_in;
1474  T* data_out = batch.data_out;
1475 
1476  transpose_xyz_yzx_device<T>(
1477  x_in, y_in, z_in,
1478  x_out, y_out,
1479  nx, ny, nz,
1480  xsize_in, ysize_in,
1481  ysize_out, zsize_out,
1482  data_in, data_out);
1483 
1484 }
1485 
1486 /*
1487 //
1488 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1489 //
1490 template <typename T>
1491 __forceinline__ __device__
1492 void transpose_xyz_yzx_dev(
1493  const int blockz,
1494  const int nx, const int ny, const int nz,
1495  const int xsize_in, const int ysize_in,
1496  const int xsize_out, const int ysize_out,
1497  const T* data_in, T* data_out) {
1498 
1499  // Shared memory
1500  __shared__ T tile[TILEDIM][TILEDIM+1];
1501 
1502  int x = blockIdx.x * TILEDIM + threadIdx.x;
1503  int y = blockIdx.y * TILEDIM + threadIdx.y;
1504  // int z = blockIdx.z + threadIdx.z;
1505  int z = blockz + threadIdx.z;
1506 
1507  // Read (x,y) data_in into tile (shared memory)
1508  for (int j=0;j < TILEDIM;j += TILEROWS)
1509  if ((x < nx) && (y + j < ny) && (z < nz))
1510  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
1511 
1512  BLOCK_SYNC;
1513 
1514  // Write (y,x) tile into data_out
1515  x = blockIdx.x * TILEDIM + threadIdx.y;
1516  y = blockIdx.y * TILEDIM + threadIdx.x;
1517  for (int j=0;j < TILEDIM;j += TILEROWS)
1518  if ((x + j < nx) && (y < ny) && (z < nz))
1519  data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
1520 
1521 }
1522 
1523 //
1524 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
1525 // (nx, ny, nz) = size of the transposed volume
1526 // (xsize_in, ysize_in, zsize_in) = size of the input data
1527 // into nblock memory blocks
1528 //
1529 template <typename T>
1530 __global__ void transpose_xyz_yzx_kernel(
1531  const int nblock,
1532  const int nx, const int ny, const int nz,
1533  const int xsize_in, const int ysize_in,
1534  const int xsize_out, const int ysize_out,
1535  const T* data_in, T* data_out) {
1536 
1537  const int iblock = blockIdx.z/nz;
1538 
1539  if (iblock < nblock) {
1540  transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
1541  xsize_in, ysize_in, xsize_out, ysize_out,
1542  data_in, data_out);
1543  }
1544 
1545 }
1546 */
1547 
1548 template <typename T>
1549 __device__ __forceinline__
1550 void transpose_xyz_zxy_device(
1551  const int x_in, const int y_in, const int z_in,
1552  const int x_out, const int z_out,
1553  const int nx, const int ny, const int nz,
1554  const int xsize_in, const int ysize_in,
1555  const int zsize_out, const int xsize_out,
1556  const T* data_in, T* data_out) {
1557 
1558  // Shared memory
1559  __shared__ T tile[TILEDIM][TILEDIM+1];
1560 
1561  // Read (x,z) data_in into tile (shared memory)
1562  for (int k=0;k < TILEDIM;k += TILEROWS)
1563  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
1564  tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
1565 
1566  BLOCK_SYNC;
1567 
1568  // Write (z,x) tile into data_out
1569  const int y_out = y_in;
1570  for (int k=0;k < TILEDIM;k += TILEROWS)
1571  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
1572  data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
1573 }
1574 
1575 //
1576 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1577 //
1578 template <typename T>
1579 __global__ void transpose_xyz_zxy_kernel(
1580  const int nx, const int ny, const int nz,
1581  const int xsize_in, const int ysize_in,
1582  const int zsize_out, const int xsize_out,
1583  const T* data_in, T* data_out) {
1584 
1585  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1586  int y_in = blockIdx.z + threadIdx.z;
1587  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1588 
1589  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1590  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1591 
1592  transpose_xyz_zxy_device<T>(
1593  x_in, y_in, z_in, x_out, z_out,
1594  nx, ny, nz,
1595  xsize_in, ysize_in,
1596  zsize_out, xsize_out,
1597  data_in, data_out);
1598 
1599 }
1600 
1601 //
1602 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1603 // Batch index bi is encoded in blockIdx.z, where
1604 // blockIdx.z = 0...ny-1 are for batch 1
1605 // blockIdx.z = ny...2*ny-1 are for batch 2
1606 // ...
1607 // gridDim.z = ny*numBatches
1608 //
1609 template <typename T>
1610 __global__ void batchTranspose_xyz_zxy_kernel(
1611  const TransposeBatch<T>* batches,
1612  const int ny, const int nz,
1613  const int xsize_in, const int ysize_in) {
1614 
1615  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1616  int y_in = (blockIdx.z % ny) + threadIdx.z;
1617  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1618 
1619  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1620  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1621 
1622  int bi = blockIdx.z/ny;
1623 
1624  TransposeBatch<T> batch = batches[bi];
1625  int nx = batch.nx;
1626  int zsize_out = batch.zsize_out;
1627  int xsize_out = batch.xsize_out;
1628  T* data_in = batch.data_in;
1629  T* data_out = batch.data_out;
1630 
1631  transpose_xyz_zxy_device<T>(
1632  x_in, y_in, z_in, x_out, z_out,
1633  nx, ny, nz,
1634  xsize_in, ysize_in,
1635  zsize_out, xsize_out,
1636  data_in, data_out);
1637 
1638 }
1639 
1640 // XXX should be defined in a centralized place
1641 #undef SQRT_PI
1642 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
1643 
1644 template<int BLOCK_SIZE>
1645 __global__ void selfEnergyKernel(
1646  double *selfEnergy,
1647  const float4 *atoms,
1648  int natoms,
1649  double ewaldcof
1650  )
1651 {
1652  double qq=0; // sum q^2 over all threads
1653  int i = threadIdx.x + blockIdx.x * blockDim.x;
1654  if (i < natoms) {
1655  double q = atoms[i].w;
1656  qq = q*q;
1657  }
1658 
1659  typedef cub::BlockReduce<double, BLOCK_SIZE> BlockReduce;
1660  __shared__ typename BlockReduce::TempStorage temp_storage;
1661  qq = BlockReduce(temp_storage).Sum(qq);
1662  __syncthreads();
1663 
1664 
1665  if (threadIdx.x == 0) {
1666  // each charge already scaled by sqrt(COULOMB * scaling * dielectric_1)
1667  // finish scaling self energy term
1668  double c = -ewaldcof * (1.0 / SQRT_PI);
1669  atomicAdd(selfEnergy, c*qq);
1670  }
1671 }
1672 
1673 double compute_selfEnergy(
1674  double *d_selfEnergy,
1675  const float4 *d_atoms,
1676  int natoms,
1677  double ewaldcof,
1678  cudaStream_t stream)
1679 {
1680  double selfEnergy = 0;
1681  const int block = 128;
1682  const int grid = (natoms + block - 1) / block;
1683  selfEnergyKernel<block><<<grid, block, 0, stream>>>(d_selfEnergy,
1684  d_atoms, natoms, ewaldcof);
1685  cudaCheck(cudaStreamSynchronize(stream));
1686  copy_DtoH<double>(d_selfEnergy, &selfEnergy, 1, stream);
1687  cudaCheck(cudaStreamSynchronize(stream));
1688  return selfEnergy;
1689 }
1690 
1691 //#######################################################################################
1692 //#######################################################################################
1693 //#######################################################################################
1694 
1695 void spread_charge(const float4 *atoms, const int numAtoms,
1696  const int nfftx, const int nffty, const int nfftz,
1697  const int xsize, const int ysize, const int zsize,
1698  const int xdim, const int y00, const int z00,
1699  const bool periodicY, const bool periodicZ,
1700  float* data, const int order, cudaStream_t stream) {
1701 
1702  dim3 nthread, nblock;
1703 
1704  switch(order) {
1705  case 4:
1706  nthread.x = 32;
1707  nthread.y = 4;
1708  nthread.z = 1;
1709  nblock.x = (numAtoms - 1)/nthread.x + 1;
1710  nblock.y = 1;
1711  nblock.z = 1;
1712  if (periodicY && periodicZ)
1713  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1714  (atoms, numAtoms,
1715  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1716  else if (periodicY)
1717  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1718  (atoms, numAtoms,
1719  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1720  else if (periodicZ)
1721  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1722  (atoms, numAtoms,
1723  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1724  else
1725  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1726  (atoms, numAtoms,
1727  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1728  break;
1729 
1730  case 6:
1731  nthread.x = 32;
1732  nthread.y = 7;
1733  nthread.z = 1;
1734  nblock.x = (numAtoms - 1)/nthread.x + 1;
1735  nblock.y = 1;
1736  nblock.z = 1;
1737  if (periodicY && periodicZ)
1738  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1739  (atoms, numAtoms,
1740  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1741  else if (periodicY)
1742  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1743  (atoms, numAtoms,
1744  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1745  else if (periodicZ)
1746  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1747  (atoms, numAtoms,
1748  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1749  else
1750  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1751  (atoms, numAtoms,
1752  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1753  break;
1754 
1755  case 8:
1756  nthread.x = 32;
1757  nthread.y = 16;
1758  nthread.z = 1;
1759  nblock.x = (numAtoms - 1)/nthread.x + 1;
1760  nblock.y = 1;
1761  nblock.z = 1;
1762  if (periodicY && periodicZ)
1763  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1764  (atoms, numAtoms,
1765  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1766  else if (periodicY)
1767  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1768  (atoms, numAtoms,
1769  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1770  else if (periodicZ)
1771  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1772  (atoms, numAtoms,
1773  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1774  else
1775  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1776  (atoms, numAtoms,
1777  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1778  break;
1779 
1780  default:
1781  char str[128];
1782  sprintf(str, "spread_charge, order %d not implemented",order);
1783  cudaNAMD_bug(str);
1784  }
1785  cudaCheck(cudaGetLastError());
1786 
1787 }
1788 
1789 
1790 // JM new version of spread_charge routine
1791 void spread_charge_v2(
1792  const PatchLevelPmeData patchLevelPmeData,
1793  const float4 *atoms, const int numAtoms,
1794  const int nfftx, const int nffty, const int nfftz,
1795  const float nfftx_f, const float nffty_f, const float nfftz_f,
1796  const int order3,
1797  const int xsize, const int ysize, const int zsize,
1798  const int xdim, const int y00, const int z00,
1799  const bool periodicY, const bool periodicZ,
1800  float* data, const int order, cudaStream_t stream) {
1801 
1802  dim3 nthread, nblock;
1803 
1804  if (patchLevelPmeData.compatible()) {
1805  const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
1806  const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
1807  cudaFuncSetAttribute(
1808  (void*)spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>,
1809  cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);
1810  spread_charge_patch_8th<float, patchLevelPmeData.kPatchGridDim>
1811  <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
1812  patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim,
1813  patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets,
1814  atoms, nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f,
1815  xsize, ysize, zsize, xdim, y00, z00, data
1816  );
1817  cudaCheck(cudaGetLastError());
1818 
1819  } else {
1820 
1821  switch(order) {
1822  case 4:
1823  nthread.x = 32;
1824  nthread.y = 4;
1825  nthread.z = 1;
1826  nblock.x = (numAtoms - 1)/nthread.x + 1;
1827  nblock.y = 1;
1828  nblock.z = 1;
1829  if (periodicY && periodicZ)
1830  spread_charge_kernel_v2<float, 4, true, true> <<< nblock, nthread, 0, stream >>>
1831  (atoms, numAtoms,
1832  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1833  else if (periodicY)
1834  spread_charge_kernel_v2<float, 4, true, false> <<< nblock, nthread, 0, stream >>>
1835  (atoms, numAtoms,
1836  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1837  else if (periodicZ)
1838  spread_charge_kernel_v2<float, 4, false, true> <<< nblock, nthread, 0, stream >>>
1839  (atoms, numAtoms,
1840  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1841  else
1842  spread_charge_kernel_v2<float, 4, false, false> <<< nblock, nthread, 0, stream >>>
1843  (atoms, numAtoms,
1844  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1845  break;
1846 
1847  case 6:
1848  nthread.x = WARPSIZE;
1849  nthread.y = order3 / WARPSIZE;
1850  nthread.z = 1;
1851  nblock.x = (numAtoms - 1)/nthread.x + 1;
1852  nblock.y = 1;
1853  nblock.z = 1;
1854  if (periodicY && periodicZ)
1855  spread_charge_kernel_v2<float, 6, true, true> <<< nblock, nthread, 0, stream >>>
1856  (atoms, numAtoms,
1857  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1858  else if (periodicY)
1859  spread_charge_kernel_v2<float, 6, true, false> <<< nblock, nthread, 0, stream >>>
1860  (atoms, numAtoms,
1861  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1862  else if (periodicZ)
1863  spread_charge_kernel_v2<float, 6, false, true> <<< nblock, nthread, 0, stream >>>
1864  (atoms, numAtoms,
1865  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1866  else
1867  spread_charge_kernel_v2<float, 6, false, false> <<< nblock, nthread, 0, stream >>>
1868  (atoms, numAtoms,
1869  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1870  break;
1871 
1872  case 8:
1873  nthread.x = WARPSIZE;
1874  nthread.y = order3 / WARPSIZE;
1875  nthread.z = 1;
1876  nblock.x = (numAtoms - 1)/nthread.x + 1;
1877  nblock.y = 1;
1878  nblock.z = 1;
1879  if (periodicY && periodicZ)
1880  spread_charge_kernel_v2<float, 8, true, true> <<< nblock, nthread, 0, stream >>>
1881  (atoms, numAtoms,
1882  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1883  else if (periodicY)
1884  spread_charge_kernel_v2<float, 8, true, false> <<< nblock, nthread, 0, stream >>>
1885  (atoms, numAtoms,
1886  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1887  else if (periodicZ)
1888  spread_charge_kernel_v2<float, 8, false, true> <<< nblock, nthread, 0, stream >>>
1889  (atoms, numAtoms,
1890  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1891  else
1892  spread_charge_kernel_v2<float, 8, false, false> <<< nblock, nthread, 0, stream >>>
1893  (atoms, numAtoms,
1894  nfftx, nffty, nfftz, nfftx_f, nffty_f, nfftz_f, xsize, ysize, zsize, xdim, y00, z00, data);
1895  break;
1896 
1897  default:
1898  char str[128];
1899  sprintf(str, "spread_charge, order %d not implemented",order);
1900  cudaNAMD_bug(str);
1901  }
1902  // cudaCheck(cudaGetLastError());
1903  }
1904 }
1905 
1906 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
1907  const int size1, const int size2, const int size3, const double kappa,
1908  const float recip1x, const float recip1y, const float recip1z,
1909  const float recip2x, const float recip2y, const float recip2z,
1910  const float recip3x, const float recip3y, const float recip3z,
1911  const double volume,
1912  const float* prefac1, const float* prefac2, const float* prefac3,
1913  const int k2_00, const int k3_00,
1914  const bool doEnergyVirial, double* energy, double* virial, float2* data,
1915  cudaStream_t stream) {
1916 #ifdef NAMD_CUDA
1917  int nthread = 1024;
1918  int nblock = 64;
1919 #else
1920  int nthread = 256;
1921  int nblock = 256;
1922 #endif
1923 
1924  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1925  if (doEnergyVirial) {
1926  shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
1927  }
1928 
1929  float piv_inv = (float)(1.0/(M_PI*volume));
1930  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1931 
1932  if (doEnergyVirial) {
1933  if (orderXYZ) {
1934  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>>
1935  (nfft1, nfft2, nfft3, size1, size2, size3,
1936  recip1x, recip1y, recip1z,
1937  recip2x, recip2y, recip2z,
1938  recip3x, recip3y, recip3z,
1939  prefac1, prefac2, prefac3,
1940  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1941  } else {
1942  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>>
1943  (nfft1, nfft2, nfft3, size1, size2, size3,
1944  recip1x, recip1y, recip1z,
1945  recip2x, recip2y, recip2z,
1946  recip3x, recip3y, recip3z,
1947  prefac1, prefac2, prefac3,
1948  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1949  }
1950  } else {
1951  if (orderXYZ) {
1952  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>>
1953  (nfft1, nfft2, nfft3, size1, size2, size3,
1954  recip1x, recip1y, recip1z,
1955  recip2x, recip2y, recip2z,
1956  recip3x, recip3y, recip3z,
1957  prefac1, prefac2, prefac3,
1958  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1959  } else {
1960  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>>
1961  (nfft1, nfft2, nfft3, size1, size2, size3,
1962  recip1x, recip1y, recip1z,
1963  recip2x, recip2y, recip2z,
1964  recip3x, recip3y, recip3z,
1965  prefac1, prefac2, prefac3,
1966  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1967  }
1968  }
1969  cudaCheck(cudaGetLastError());
1970 
1971 }
1972 
1973 void gather_force(
1974  const PatchLevelPmeData patchLevelPmeData,
1975  const float4 *atoms, const int numAtoms,
1976  // const float recip11, const float recip22, const float recip33,
1977  const int nfftx, const int nffty, const int nfftz,
1978  const int xsize, const int ysize, const int zsize,
1979  const int xdim, const int y00, const int z00,
1980  const bool periodicY, const bool periodicZ,
1981  const float* data, const int order, float3* force,
1982 #ifdef NAMD_CUDA
1983  const cudaTextureObject_t gridTexObj,
1984 #endif
1985  cudaStream_t stream) {
1986 
1987  if (patchLevelPmeData.compatible()) {
1988  const dim3 numBlocks = dim3(patchLevelPmeData.numPatches, 1, 1);
1989  const int sharedMemory = patchLevelPmeData.spreadChargeSharedBytes;
1990 
1991  cudaFuncSetAttribute(
1992  (void*)gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>,
1993  cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemory);
1994  gather_force_patch_8th<float, float3, PatchLevelPmeData::kPatchGridDim>
1995  <<<numBlocks, PatchLevelPmeData::kNumThreads, sharedMemory, stream>>>(
1996  patchLevelPmeData.numPatches, patchLevelPmeData.patchGridDim,
1997  patchLevelPmeData.localRecords, patchLevelPmeData.d_patchGridOffsets,
1998  atoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data,
1999  1, force
2000  );
2001 
2002  cudaCheck(cudaGetLastError());
2003  } else {
2004  dim3 nthread(32, 2, 1);
2005  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
2006  // dim3 nblock(npatch, 1, 1);
2007 
2008  switch(order) {
2009  case 4:
2010  if (periodicY && periodicZ)
2011  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>>
2012  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2013  // recip11, recip22, recip33,
2014  data,
2015 #ifdef NAMD_CUDA
2016  gridTexObj,
2017 #endif
2018  1, force);
2019  else if (periodicY)
2020  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>>
2021  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2022  // recip11, recip22, recip33,
2023  data,
2024 #ifdef NAMD_CUDA
2025  gridTexObj,
2026 #endif
2027  1, force);
2028  else if (periodicZ)
2029  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>>
2030  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2031  // recip11, recip22, recip33,
2032  data,
2033 #ifdef NAMD_CUDA
2034  gridTexObj,
2035 #endif
2036  1, force);
2037  else
2038  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>>
2039  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2040  // recip11, recip22, recip33,
2041  data,
2042 #ifdef NAMD_CUDA
2043  gridTexObj,
2044 #endif
2045  1, force);
2046  break;
2047 
2048  case 6:
2049  if (periodicY && periodicZ)
2050  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>>
2051  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2052  // recip11, recip22, recip33,
2053  data,
2054 #ifdef NAMD_CUDA
2055  gridTexObj,
2056 #endif
2057  1, force);
2058  else if (periodicY)
2059  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>>
2060  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2061  // recip11, recip22, recip33,
2062  data,
2063 #ifdef NAMD_CUDA
2064  gridTexObj,
2065 #endif
2066  1, force);
2067  else if (periodicZ)
2068  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>>
2069  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2070  // recip11, recip22, recip33,
2071  data,
2072 #ifdef NAMD_CUDA
2073  gridTexObj,
2074 #endif
2075  1, force);
2076  else
2077  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>>
2078  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2079  // recip11, recip22, recip33,
2080  data,
2081 #ifdef NAMD_CUDA
2082  gridTexObj,
2083 #endif
2084  1, force);
2085  break;
2086 
2087  case 8:
2088  if (periodicY && periodicZ)
2089  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>>
2090  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2091  // recip11, recip22, recip33,
2092  data,
2093 #ifdef NAMD_CUDA
2094  gridTexObj,
2095 #endif
2096  1, force);
2097  else if (periodicY)
2098  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>>
2099  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2100  // recip11, recip22, recip33,
2101  data,
2102 #ifdef NAMD_CUDA
2103  gridTexObj,
2104 #endif
2105  1, force);
2106  else if (periodicZ)
2107  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>>
2108  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2109  // recip11, recip22, recip33,
2110  data,
2111 #ifdef NAMD_CUDA
2112  gridTexObj,
2113 #endif
2114  1, force);
2115  else
2116  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>>
2117  (atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
2118  // recip11, recip22, recip33,
2119  data,
2120 #ifdef NAMD_CUDA
2121  gridTexObj,
2122 #endif
2123  1, force);
2124  break;
2125 
2126  default:
2127  char str[128];
2128  sprintf(str, "gather_force, order %d not implemented",order);
2129  cudaNAMD_bug(str);
2130  }
2131  cudaCheck(cudaGetLastError());
2132  }
2133 }
2134 
2135 //
2136 // Transpose
2137 //
2138 void transpose_xyz_yzx(
2139  const int nx, const int ny, const int nz,
2140  const int xsize_in, const int ysize_in,
2141  const int ysize_out, const int zsize_out,
2142  const float2* data_in, float2* data_out, cudaStream_t stream) {
2143 
2144  dim3 numthread(TILEDIM, TILEROWS, 1);
2145  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
2146 
2147  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
2148  (nx, ny, nz, xsize_in, ysize_in,
2149  ysize_out, zsize_out,
2150  data_in, data_out);
2151 
2152  cudaCheck(cudaGetLastError());
2153 }
2154 
2155 //
2156 // Batched transpose
2157 //
2158 void batchTranspose_xyz_yzx(
2159  const int numBatches, TransposeBatch<float2>* batches,
2160  const int max_nx, const int ny, const int nz,
2161  const int xsize_in, const int ysize_in, cudaStream_t stream) {
2162 
2163  dim3 numthread(TILEDIM, TILEROWS, 1);
2164  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
2165 
2166  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>>
2167  (batches, ny, nz, xsize_in, ysize_in);
2168 
2169  cudaCheck(cudaGetLastError());
2170 }
2171 
2172 //
2173 // Transpose
2174 //
2175 void transpose_xyz_zxy(
2176  const int nx, const int ny, const int nz,
2177  const int xsize_in, const int ysize_in,
2178  const int zsize_out, const int xsize_out,
2179  const float2* data_in, float2* data_out, cudaStream_t stream) {
2180 
2181  dim3 numthread(TILEDIM, TILEROWS, 1);
2182  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
2183 
2184  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
2185  (nx, ny, nz, xsize_in, ysize_in,
2186  zsize_out, xsize_out,
2187  data_in, data_out);
2188 
2189  cudaCheck(cudaGetLastError());
2190 }
2191 
2192 //
2193 // Batched transpose
2194 //
2195 void batchTranspose_xyz_zxy(
2196  const int numBatches, TransposeBatch<float2>* batches,
2197  const int max_nx, const int ny, const int nz,
2198  const int xsize_in, const int ysize_in,
2199  cudaStream_t stream) {
2200 
2201  dim3 numthread(TILEDIM, TILEROWS, 1);
2202  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
2203 
2204  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>>
2205  (batches, ny, nz, xsize_in, ysize_in);
2206 
2207  cudaCheck(cudaGetLastError());
2208 }
2209 
2210 template <int block_size, bool alchDecouple>
2211 __global__ void calcSelfEnergyFEPKernel(double* d_selfEnergy, double* d_selfEnergy_FEP, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down) {
2212  const int i = blockIdx.x * blockDim.x + threadIdx.x;
2213  double q = 0;
2214  double qq1 = 0;
2215  double qq2 = 0;
2216  double scaleLambda1 = 0;
2217  double scaleLambda2 = 0;
2218  // NOTE: len is NOT the full length of d_atoms array, it's just the length of the first grid
2219  // d_atoms always has at least 2 grids when the code reaches here since we are doing alchemical transformations
2220  if (i < len) {
2221  switch (d_partition[i]) {
2222  case 0: {
2223  scaleLambda1 = 1.0f;
2224  scaleLambda2 = 1.0f;
2225  q = d_atoms[i].w; // or d_atoms_grid2[i].w
2226  break;
2227  }
2228  case 1: {
2229  scaleLambda1 = lambda1Up; // lambda1 up
2230  scaleLambda2 = lambda2Up; // lambda2 up
2231  q = d_atoms[i].w;
2232  break;
2233  }
2234  case 2: {
2235  scaleLambda1 = lambda1Down; // lambda1 down
2236  scaleLambda2 = lambda2Down; // lambda2 down
2237  // the charges in the #1 grid are already scaled to 0
2238  // so I fetch the charges from the second grid
2239  q = d_atoms[i + len].w;
2240  break;
2241  }
2242  }
2243  if (alchDecouple) {
2244  qq1 = q * q;
2245  qq2 = q * q;
2246  } else {
2247  qq1 = q * q * scaleLambda1;
2248  qq2 = q * q * scaleLambda2;
2249  }
2250  }
2251  typedef cub::BlockReduce<double, block_size> BlockReduce;
2252  __shared__ typename BlockReduce::TempStorage temp_storage;
2253  qq1 = BlockReduce(temp_storage).Sum(qq1);
2254  __syncthreads();
2255 
2256  qq2 = BlockReduce(temp_storage).Sum(qq2);
2257  __syncthreads();
2258  if (threadIdx.x == 0) {
2259  const double c = -ewaldcof * (1.0 / SQRT_PI);
2260  atomicAdd(d_selfEnergy, c*qq1);
2261  atomicAdd(d_selfEnergy_FEP, c*qq2);
2262  }
2263 }
2264 
2265 
2266 void calcSelfEnergyFEPWrapper(double* d_selfEnergy, double* d_selfEnergy_FEP, double& h_selfEnergy, double& h_selfEnergyFEP, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda2Up, const double lambda1Down, const double lambda2Down, cudaStream_t stream) {
2267  const int block_size = 128;
2268  const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2269  if (alchDecouple) {
2270  calcSelfEnergyFEPKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
2271  } else {
2272  calcSelfEnergyFEPKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_FEP, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda2Up, lambda1Down, lambda2Down);
2273  }
2274  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
2275  copy_DtoH<double>(d_selfEnergy_FEP, &h_selfEnergyFEP, 1, stream);
2276  cudaCheck(cudaStreamSynchronize(stream));
2277 }
2278 
2279 template <int block_size, bool alchDecouple>
2280 __global__ void calcSelfEnergyTIKernel(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int len, const double ewaldcof, const double lambda1Up, const double lambda1Down) {
2281  const int i = blockIdx.x * blockDim.x + threadIdx.x;
2282  double q = 0;
2283  double qq = 0.0;
2284  double qq1 = 0.0;
2285  double qq2 = 0.0;
2286  double factor_ti_1 = 0.0;
2287  double factor_ti_2 = 0.0;
2288  double elecLambda1 = 0.0;
2289  if (i < len) {
2290  switch (d_partition[i]) {
2291  case 0: {
2292  elecLambda1 = 1.0;
2293  factor_ti_1 = 0.0;
2294  factor_ti_2 = 0.0;
2295  q = d_atoms[i].w; // or d_atoms_grid2[i].w
2296  break;
2297  }
2298  case 1: {
2299  elecLambda1 = lambda1Up;
2300  factor_ti_1 = 1.0;
2301  factor_ti_2 = 0.0;
2302  q = d_atoms[i].w;
2303  break;
2304  }
2305  case 2: {
2306  elecLambda1 = lambda1Down;
2307  factor_ti_1 = 0.0;
2308  factor_ti_2 = 1.0;
2309  q = d_atoms[i + len].w;
2310  break;
2311  }
2312  }
2313  if (alchDecouple) {
2314  qq = q * q;
2315  } else {
2316  qq = q * q * elecLambda1;
2317  qq1 = q * q * factor_ti_1;
2318  qq2 = q * q * factor_ti_2;
2319  }
2320  }
2321  typedef cub::BlockReduce<double, block_size> BlockReduce;
2322  __shared__ typename BlockReduce::TempStorage temp_storage;
2323  qq = BlockReduce(temp_storage).Sum(qq);
2324  __syncthreads();
2325  qq1 = BlockReduce(temp_storage).Sum(qq1);
2326  __syncthreads();
2327  qq2 = BlockReduce(temp_storage).Sum(qq2);
2328  __syncthreads();
2329  if (threadIdx.x == 0) {
2330  const double c = -ewaldcof * (1.0 / SQRT_PI);
2331  atomicAdd(d_selfEnergy, c*qq);
2332  atomicAdd(d_selfEnergy_TI_1, c*qq1);
2333  atomicAdd(d_selfEnergy_TI_2, c*qq2);
2334  }
2335  __syncthreads();
2336 }
2337 
2338 void calcSelfEnergyTIWrapper(double* d_selfEnergy, double* d_selfEnergy_TI_1, double* d_selfEnergy_TI_2, double& h_selfEnergy, double& h_selfEnergy_TI_1, double& h_selfEnergy_TI_2, const float4* d_atoms, const int* d_partition, const int num_atoms, const double ewaldcof, const bool alchDecouple, const double lambda1Up, const double lambda1Down, cudaStream_t stream) {
2339  const int block_size = 128;
2340  const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2341  if (alchDecouple) {
2342  calcSelfEnergyTIKernel<block_size, true><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
2343  } else {
2344  calcSelfEnergyTIKernel<block_size, false><<<num_blocks, block_size, 0, stream>>>(d_selfEnergy, d_selfEnergy_TI_1, d_selfEnergy_TI_2, d_atoms, d_partition, num_atoms, ewaldcof, lambda1Up, lambda1Down);
2345  }
2346  copy_DtoH<double>(d_selfEnergy, &h_selfEnergy, 1, stream);
2347  copy_DtoH<double>(d_selfEnergy_TI_1, &h_selfEnergy_TI_1, 1, stream);
2348  copy_DtoH<double>(d_selfEnergy_TI_2, &h_selfEnergy_TI_2, 1, stream);
2349  cudaCheck(cudaStreamSynchronize(stream));
2350 }
2351 
2352 template <int NUM_ARRAYS>
2353 __global__ void scaleAndMergeForceKernel(float3* forces, const float* factors, const int num_atoms) {
2354  const int i = blockIdx.x * blockDim.x + threadIdx.x;
2355  if (i >= num_atoms) return;
2356  switch (NUM_ARRAYS) {
2357  case 2: {
2358  forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1];
2359  forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1];
2360  forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1];
2361  break;
2362  }
2363  case 3: {
2364  forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2];
2365  forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2];
2366  forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2];
2367  break;
2368  }
2369  case 4: {
2370  forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3];
2371  forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3];
2372  forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3];
2373  break;
2374  }
2375  case 5: {
2376  forces[i].x = forces[i].x * factors[0] + forces[i + num_atoms].x * factors[1] + forces[i + 2 * num_atoms].x * factors[2] + forces[i + 3 * num_atoms].x * factors[3] + forces[i + 4 * num_atoms].x * factors[4];
2377  forces[i].y = forces[i].y * factors[0] + forces[i + num_atoms].y * factors[1] + forces[i + 2 * num_atoms].y * factors[2] + forces[i + 3 * num_atoms].y * factors[3] + forces[i + 4 * num_atoms].y * factors[4];
2378  forces[i].z = forces[i].z * factors[0] + forces[i + num_atoms].z * factors[1] + forces[i + 2 * num_atoms].z * factors[2] + forces[i + 3 * num_atoms].z * factors[3] + forces[i + 4 * num_atoms].z * factors[4];
2379  break;
2380  }
2381  }
2382 }
2383 
2384 void scaleAndMergeForceWrapper(float3* forces, const float* factors, const size_t num_arrays, const int num_atoms, cudaStream_t stream) {
2385  const int block_size = 128;
2386  const int num_blocks = int(std::ceil(double(num_atoms) / block_size));
2387  switch (num_arrays) {
2388  case 2: scaleAndMergeForceKernel<2><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2389  case 3: scaleAndMergeForceKernel<3><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2390  case 4: scaleAndMergeForceKernel<4><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2391  case 5: scaleAndMergeForceKernel<5><<<num_blocks, block_size, 0, stream>>>(forces, factors, num_atoms); break;
2392  }
2393  cudaCheck(cudaGetLastError());
2394 }
2395 
2396 #endif // NAMD_CUDA
2397