NAMD
CudaPmeSolverUtilKernel.cu
1 #ifdef WIN32
2 #define _USE_MATH_DEFINES
3 #endif
4 #include <math.h>
5 #include <stdio.h>
6 #ifdef NAMD_CUDA
7 #include <cuda.h>
8 #endif
9 #ifdef NAMD_HIP
10 #include <hip/hip_runtime.h>
11 #include "HipDefines.h"
12 #endif
13 #include "CudaUtils.h"
14 #include "CudaPmeSolverUtilKernel.h"
15 
16 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
17 // CCELEC is 1/ (4 pi eps ) in AKMA units, conversion from SI
18 // units: CCELEC = e*e*Na / (4*pi*eps*1Kcal*1A)
19 //
20 // parameter :: CCELEC=332.0636D0 ! old value of dubious origin
21 // parameter :: CCELEC=331.843D0 ! value from 1986-1987 CRC Handbook
22 // ! of Chemistry and Physics
23 // real(chm_real), parameter :: &
24 // CCELEC_amber = 332.0522173D0, &
25 // CCELEC_charmm = 332.0716D0 , &
26 // CCELEC_discover = 332.054D0 , &
27 // CCELEC_namd = 332.0636D0
28 //const double ccelec = 332.0636;
29 //const double half_ccelec = 0.5*ccelec;
30 //const float ccelec_float = 332.0636f;
31 
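// The conversion quoted above can be sanity-checked numerically; the sketch
// below is illustrative only (not part of the build), uses CODATA constants
// chosen here, and reproduces CCELEC ~ 332.06 kcal*A/(mol*e^2).
/*
#include <math.h>
#include <stdio.h>
int main() {
  const double e    = 1.602176634e-19;   // elementary charge [C]
  const double Na   = 6.02214076e23;     // Avogadro constant [1/mol]
  const double eps0 = 8.8541878128e-12;  // vacuum permittivity [C^2/(J m)]
  const double kcal = 4184.0;            // [J]
  const double ang  = 1.0e-10;           // [m]
  // CCELEC = e*e*Na / (4*pi*eps0 * 1 kcal * 1 A)
  printf("CCELEC ~ %.4f\n", e*e*Na/(4.0*M_PI*eps0*kcal*ang));
  return 0;
}
*/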
32 /*
33 // Structure into which virials are stored
34 // NOTE: This structure is only used for computing addresses
35 struct Virial_t {
36  double sforce_dp[27][3];
37  long long int sforce_fp[27][3];
38  double virmat[9];
39  // Energies start here ...
40 };
41 */
42 
43 // Local structure used by the scalar_sum kernel for energy and virial reductions
44 struct RecipVirial_t {
45  double energy;
46  double virial[6];
47 };
48 
49 
50 __forceinline__ __device__ double expfunc(const double x) {
51  return exp(x);
52 }
53 
54 __forceinline__ __device__ float expfunc(const float x) {
55  return __expf(x);
56 }
57 
58 //
59 // Performs scalar sum on data(nfft1, nfft2, nfft3)
60 // T = float or double
61 // T2 = float2 or double2
62 //
63 template <typename T, typename T2, bool calc_energy_virial, bool orderXYZ, bool doOrtho>
64 __global__ void scalar_sum_kernel(const int nfft1, const int nfft2, const int nfft3,
65  const int size1, const int size2, const int size3,
66  const T recip1x, const T recip1y, const T recip1z,
67  const T recip2x, const T recip2y, const T recip2z,
68  const T recip3x, const T recip3y, const T recip3z,
69  const T* prefac1, const T* prefac2, const T* prefac3,
70  const T pi_ewald, const T piv_inv,
71  const int k2_00, const int k3_00,
72  T2* data,
73  double* __restrict__ energy_recip,
74  double* __restrict__ virial) {
75  // Shared memory required: sizeof(T)*(nfft1 + nfft2 + nfft3)
76  #ifdef NAMD_CUDA
77  extern __shared__ T sh_prefac[];
78  #else //NAMD_HIP
79  HIP_DYNAMIC_SHARED( T, sh_prefac)
80  #endif
81 
82  // Create pointers to shared memory
83  T* sh_prefac1 = (T *)&sh_prefac[0];
84  T* sh_prefac2 = (T *)&sh_prefac[nfft1];
85  T* sh_prefac3 = (T *)&sh_prefac[nfft1 + nfft2];
86 
87  // Calculate start position (k1, k2, k3) for each thread
88  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
89  int k3 = tid/(size1*size2);
90  tid -= k3*size1*size2;
91  int k2 = tid/size1;
92  int k1 = tid - k2*size1;
93 
94  // Starting position in data
95  int pos = k1 + (k2 + k3*size2)*size1;
96 
97  // Move (k2, k3) to the global coordinate (k1_00 = 0 since this is the pencil direction)
98  k2 += k2_00;
99  k3 += k3_00;
100 
101  // Calculate limits w.r.t. global coordinates
102  const int lim2 = size2 + k2_00;
103  const int lim3 = size3 + k3_00;
104 
105  // Calculate increments (k1_inc, k2_inc, k3_inc)
106  int tot_inc = blockDim.x*gridDim.x;
107  const int k3_inc = tot_inc/(size1*size2);
108  tot_inc -= k3_inc*size1*size2;
109  const int k2_inc = tot_inc/size1;
110  const int k1_inc = tot_inc - k2_inc*size1;
111 
112  // Set data[0] = 0 for the global (0,0,0)
113  if (k1 == 0 && k2 == 0 && k3 == 0) {
114  T2 zero;
115  zero.x = (T)0;
116  zero.y = (T)0;
117  data[0] = zero;
118  // Increment position
119  k1 += k1_inc;
120  pos += k1_inc;
121  if (k1 >= size1) {
122  k1 -= size1;
123  k2++;
124  }
125  k2 += k2_inc;
126  pos += k2_inc*size1;
127  if (k2 >= lim2) {
128  k2 -= size2;
129  k3++;
130  }
131  k3 += k3_inc;
132  pos += k3_inc*size1*size2;
133  }
134 
135  // Load prefac data into shared memory
136  {
137  int t = threadIdx.x;
138  while (t < nfft1) {
139  sh_prefac1[t] = prefac1[t];
140  t += blockDim.x;
141  }
142  t = threadIdx.x;
143  while (t < nfft2) {
144  sh_prefac2[t] = prefac2[t];
145  t += blockDim.x;
146  }
147  t = threadIdx.x;
148  while (t < nfft3) {
149  sh_prefac3[t] = prefac3[t];
150  t += blockDim.x;
151  }
152  }
153  BLOCK_SYNC;
154 
155  double energy = 0.0;
156  double virial0 = 0.0;
157  double virial1 = 0.0;
158  double virial2 = 0.0;
159  double virial3 = 0.0;
160  double virial4 = 0.0;
161  double virial5 = 0.0;
162 
163  // If nfft1 is odd, set nfft1_half to impossible value so that
164  // the second condition in "if ( (k1 == 0) || (k1 == nfft1_half) )"
165  // is never satisfied
166  const int nfft1_half = ((nfft1 & 1) == 0) ? nfft1/2 : -1;
167  const int nfft2_half = nfft2/2;
168  const int nfft3_half = nfft3/2;
169 
170  while (k3 < lim3) {
171 
172  T2 q = data[pos];
173 
174  int k2s = (k2 <= nfft2_half) ? k2 : k2 - nfft2;
175  int k3s = (k3 <= nfft3_half) ? k3 : k3 - nfft3;
176 
177  T m1, m2, m3;
178  if (doOrtho) {
179  m1 = recip1x*k1;
180  m2 = recip2y*k2s;
181  m3 = recip3z*k3s;
182  } else {
183  m1 = recip1x*k1 + recip2x*k2s + recip3x*k3s;
184  m2 = recip1y*k1 + recip2y*k2s + recip3y*k3s;
185  m3 = recip1z*k1 + recip2z*k2s + recip3z*k3s;
186  }
187 
188  T msq = m1*m1 + m2*m2 + m3*m3;
189  T msq_inv = ((T)1)/msq;
190 
191  T theta3 = sh_prefac1[k1]*sh_prefac2[k2]*sh_prefac3[k3];
192  T q2 = ((T)2)*(q.x*q.x + q.y*q.y)*theta3;
193  if ( (k1 == 0) || (k1 == nfft1_half) ) q2 *= ((T)0.5);
194  T xp3 = expfunc(-pi_ewald*msq)*piv_inv;
195  T C = xp3*msq_inv;
196  T theta = theta3*C;
197 
198  if (calc_energy_virial) {
199  T fac = q2*C;
200  T vir = ((T)2)*(pi_ewald + msq_inv);
201  energy += fac;
202  virial0 += (double)(fac*(vir*m1*m1 - ((T)1)));
203  virial1 += (double)(fac*vir*m1*m2);
204  virial2 += (double)(fac*vir*m1*m3);
205  virial3 += (double)(fac*(vir*m2*m2 - ((T)1)));
206  virial4 += (double)(fac*vir*m2*m3);
207  virial5 += (double)(fac*(vir*m3*m3 - ((T)1)));
208  }
209 
210  q.x *= theta;
211  q.y *= theta;
212  data[pos] = q;
213 
214  // Increment position
215  k1 += k1_inc;
216  pos += k1_inc;
217  if (k1 >= size1) {
218  k1 -= size1;
219  k2++;
220  }
221  k2 += k2_inc;
222  pos += k2_inc*size1;
223  if (k2 >= lim2) {
224  k2 -= size2;
225  k3++;
226  }
227  k3 += k3_inc;
228  pos += k3_inc*size2*size1;
229  }
230 
231  // Reduce energy and virial
232  if (calc_energy_virial) {
233  const int tid = threadIdx.x & (warpSize-1);
234  const int base = (threadIdx.x/warpSize);
235  volatile RecipVirial_t* sh_ev = (RecipVirial_t *)sh_prefac;
236  // Reduce within warps
237  for (int d=warpSize/2;d >= 1;d /= 2) {
238  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
239  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
240  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
241  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
242  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
243  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
244  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
245  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
246  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
247  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
248  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
249  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
250  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
251  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
252  }
253  // Reduce between warps
254  // NOTE: this BLOCK_SYNC is needed because we're using a single shared memory buffer
255  BLOCK_SYNC;
256  if (tid == 0) {
257  sh_ev[base].energy = energy;
258  sh_ev[base].virial[0] = virial0;
259  sh_ev[base].virial[1] = virial1;
260  sh_ev[base].virial[2] = virial2;
261  sh_ev[base].virial[3] = virial3;
262  sh_ev[base].virial[4] = virial4;
263  sh_ev[base].virial[5] = virial5;
264  }
265  BLOCK_SYNC;
266  if (base == 0) {
267  energy = (tid < blockDim.x/warpSize) ? sh_ev[tid].energy : 0.0;
268  virial0 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[0] : 0.0;
269  virial1 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[1] : 0.0;
270  virial2 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[2] : 0.0;
271  virial3 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[3] : 0.0;
272  virial4 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[4] : 0.0;
273  virial5 = (tid < blockDim.x/warpSize) ? sh_ev[tid].virial[5] : 0.0;
274  for (int d=warpSize/2;d >= 1;d /= 2) {
275  energy += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(energy), tid+d, WARPSIZE),
276  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(energy), tid+d, WARPSIZE));
277  virial0 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial0), tid+d, WARPSIZE),
278  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial0), tid+d, WARPSIZE));
279  virial1 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial1), tid+d, WARPSIZE),
280  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial1), tid+d, WARPSIZE));
281  virial2 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial2), tid+d, WARPSIZE),
282  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial2), tid+d, WARPSIZE));
283  virial3 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial3), tid+d, WARPSIZE),
284  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial3), tid+d, WARPSIZE));
285  virial4 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial4), tid+d, WARPSIZE),
286  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial4), tid+d, WARPSIZE));
287  virial5 += __hiloint2double(WARP_SHUFFLE(WARP_FULL_MASK, __double2hiint(virial5), tid+d, WARPSIZE),
288  WARP_SHUFFLE(WARP_FULL_MASK, __double2loint(virial5), tid+d, WARPSIZE));
289  }
290  }
291 
292  if (threadIdx.x == 0) {
293  atomicAdd(energy_recip, energy*0.5);
294  virial0 *= -0.5;
295  virial1 *= -0.5;
296  virial2 *= -0.5;
297  virial3 *= -0.5;
298  virial4 *= -0.5;
299  virial5 *= -0.5;
300  atomicAdd(&virial[0], virial0);
301  atomicAdd(&virial[1], virial1);
302  atomicAdd(&virial[2], virial2);
303  atomicAdd(&virial[3], virial3);
304  atomicAdd(&virial[4], virial4);
305  atomicAdd(&virial[5], virial5);
306  }
307 
308  }
309 
310 }
311 
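// What the kernel above applies to each non-zero mode m = k1*recip1 + k2s*recip2 + k3s*recip3
// is the usual smooth-PME reciprocal-space factor; judging from the host wrapper scalar_sum()
// further below, pi_ewald = pi^2/kappa^2 and piv_inv = 1/(pi*V).  A host-side restatement of
// the per-mode factor (the names b1, b2, b3 for the B-spline prefactors are placeholders,
// not NAMD API):
/*
#include <math.h>
static double pme_mode_factor(double m1, double m2, double m3,
                              double b1, double b2, double b3,
                              double kappa, double volume) {
  double msq      = m1*m1 + m2*m2 + m3*m3;     // |m|^2
  double pi_ewald = M_PI*M_PI/(kappa*kappa);   // 'fac' in scalar_sum()
  double piv_inv  = 1.0/(M_PI*volume);         // 'piv_inv' in scalar_sum()
  return b1*b2*b3 * exp(-pi_ewald*msq) * piv_inv / msq;
}
// With calc_energy_virial the kernel accumulates 2*|S(m)|^2 (|S(m)|^2 = q.x^2+q.y^2) times
// this factor over the stored half-spectrum and the final atomicAdd applies 0.5, which works
// out to the standard reciprocal energy
//   E_recip = 1/(2*pi*V) * sum_{m != 0} B(m) * exp(-pi^2 |m|^2 / kappa^2) / |m|^2 * |S(m)|^2.
*/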
312 template <typename T>
313 __forceinline__ __device__ void write_grid(const float val, const int ind,
314  T* data) {
315 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3))
316  atomicAddNoRet(&data[ind], (T)val);
317 #else
318  atomicAdd(&data[ind], (T)val);
319 #endif
320 }
321 
322 //
323 // General version for any order
324 //
325 template <typename T, int order>
326 __forceinline__ __device__ void calc_one_theta(const T w, T *theta) {
327 
328  theta[order-1] = ((T)0);
329  theta[1] = w;
330  theta[0] = ((T)1) - w;
331 
332 #pragma unroll
333  for (int k=3;k <= order-1;k++) {
334  T div = ((T)1) / (T)(k-1);
335  theta[k-1] = div*w*theta[k-2];
336 #pragma unroll
337  for (int j=1;j <= k-2;j++) {
338  theta[k-j-1] = div*((w+j)*theta[k-j-2] + (k-j-w)*theta[k-j-1]);
339  }
340  theta[0] = div*(((T)1) - w)*theta[0];
341  }
342 
343  //--- one more recursion
344  T div = ((T)1) / (T)(order-1);
345  theta[order-1] = div*w*theta[order-2];
346 #pragma unroll
347  for (int j=1;j <= order-2;j++) {
348  theta[order-j-1] = div*((w+j)*theta[order-j-2] + (order-j-w)*theta[order-j-1]);
349  }
350 
351  theta[0] = div*(((T)1) - w)*theta[0];
352 }
353 
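// Unrolling the recursion above for order 4 gives the familiar cubic B-spline
// weights; they form a partition of unity for any w in [0,1).  A small host-side
// check (illustrative only, not used by the kernels):
/*
#include <assert.h>
#include <math.h>
static void bspline4_host(double w, double theta[4]) {
  theta[0] = (1.0 - w)*(1.0 - w)*(1.0 - w)/6.0;
  theta[1] = (3.0*w*w*w - 6.0*w*w + 4.0)/6.0;
  theta[2] = (-3.0*w*w*w + 3.0*w*w + 3.0*w + 1.0)/6.0;
  theta[3] = w*w*w/6.0;
}
static void check_partition_of_unity(double w) {
  double th[4];
  bspline4_host(w, th);
  assert(fabs(th[0] + th[1] + th[2] + th[3] - 1.0) < 1e-12);
}
*/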
354 //
355 // Calculate theta and dtheta for general order bspline
356 //
357 template <typename T, typename T3, int order>
358 __forceinline__ __device__ void calc_theta_dtheta(T wx, T wy, T wz, T3 *theta, T3 *dtheta) {
359 
360  theta[order-1].x = ((T)0);
361  theta[order-1].y = ((T)0);
362  theta[order-1].z = ((T)0);
363  theta[1].x = wx;
364  theta[1].y = wy;
365  theta[1].z = wz;
366  theta[0].x = ((T)1) - wx;
367  theta[0].y = ((T)1) - wy;
368  theta[0].z = ((T)1) - wz;
369 
370 #pragma unroll
371  for (int k=3;k <= order-1;k++) {
372  T div = ((T)1) / (T)(k-1);
373  theta[k-1].x = div*wx*theta[k-2].x;
374  theta[k-1].y = div*wy*theta[k-2].y;
375  theta[k-1].z = div*wz*theta[k-2].z;
376 #pragma unroll
377  for (int j=1;j <= k-2;j++) {
378  theta[k-j-1].x = div*((wx + j)*theta[k-j-2].x + (k-j-wx)*theta[k-j-1].x);
379  theta[k-j-1].y = div*((wy + j)*theta[k-j-2].y + (k-j-wy)*theta[k-j-1].y);
380  theta[k-j-1].z = div*((wz + j)*theta[k-j-2].z + (k-j-wz)*theta[k-j-1].z);
381  }
382  theta[0].x = div*(((T)1) - wx)*theta[0].x;
383  theta[0].y = div*(((T)1) - wy)*theta[0].y;
384  theta[0].z = div*(((T)1) - wz)*theta[0].z;
385  }
386 
387  //--- perform standard b-spline differentiation
388  dtheta[0].x = -theta[0].x;
389  dtheta[0].y = -theta[0].y;
390  dtheta[0].z = -theta[0].z;
391 #pragma unroll
392  for (int j=2;j <= order;j++) {
393  dtheta[j-1].x = theta[j-2].x - theta[j-1].x;
394  dtheta[j-1].y = theta[j-2].y - theta[j-1].y;
395  dtheta[j-1].z = theta[j-2].z - theta[j-1].z;
396  }
397 
398  //--- one more recursion
399  T div = ((T)1) / (T)(order-1);
400  theta[order-1].x = div*wx*theta[order-2].x;
401  theta[order-1].y = div*wy*theta[order-2].y;
402  theta[order-1].z = div*wz*theta[order-2].z;
403 #pragma unroll
404  for (int j=1;j <= order-2;j++) {
405  theta[order-j-1].x = div*((wx + j)*theta[order-j-2].x + (order-j-wx)*theta[order-j-1].x);
406  theta[order-j-1].y = div*((wy + j)*theta[order-j-2].y + (order-j-wy)*theta[order-j-1].y);
407  theta[order-j-1].z = div*((wz + j)*theta[order-j-2].z + (order-j-wz)*theta[order-j-1].z);
408  }
409 
410  theta[0].x = div*(((T)1) - wx)*theta[0].x;
411  theta[0].y = div*(((T)1) - wy)*theta[0].y;
412  theta[0].z = div*(((T)1) - wz)*theta[0].z;
413 }
414 
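// The differentiation block above is applied to the order-(order-1) weights, so
// dtheta[] holds the exact derivatives d(theta[])/dw of the final order-`order`
// weights, and they always sum to zero.  For order 4 the derivatives written out
// explicitly (i.e. dtheta[0] = -theta[0], dtheta[j] = theta[j-1] - theta[j]
// applied to the quadratic weights) are, as an illustrative host-side sketch:
/*
static void dbspline4_host(double w, double dtheta[4]) {
  dtheta[0] = -0.5*(1.0 - w)*(1.0 - w);
  dtheta[1] =  0.5*(3.0*w*w - 4.0*w);
  dtheta[2] =  0.5*(-3.0*w*w + 2.0*w + 1.0);
  dtheta[3] =  0.5*w*w;
}
*/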
415 //
416 // Spreads the charge on the grid. Calculates theta and dtheta on the fly
417 // blockDim.x = Number of atoms each block loads
418 // blockDim.y*blockDim.x/order3 = Number of atoms we spread at once
419 //
420 template <typename AT, int order, bool periodicY, bool periodicZ>
421 __global__ void
422 spread_charge_kernel(const float4 *xyzq, const int ncoord,
423  const int nfftx, const int nffty, const int nfftz,
424  const int xsize, const int ysize, const int zsize,
425  const int xdim, const int y00, const int z00,
426  AT* data) {
427 
428  // Shared memory use:
429  // order = 4: 1920 bytes
430  // order = 6: 2688 bytes
431  // order = 8: 3456 bytes
432  __shared__ int sh_ix[WARPSIZE];
433  __shared__ int sh_iy[WARPSIZE];
434  __shared__ int sh_iz[WARPSIZE];
435  __shared__ float sh_thetax[order*WARPSIZE];
436  __shared__ float sh_thetay[order*WARPSIZE];
437  __shared__ float sh_thetaz[order*WARPSIZE];
438 
439  // Process atoms pos to pos_end-1 (blockDim.x)
440  const unsigned int pos = blockIdx.x*blockDim.x + threadIdx.x;
441  const unsigned int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
442 
443  if (pos < pos_end && threadIdx.y == 0) {
444 
445  float4 xyzqi = xyzq[pos];
446  float x = xyzqi.x;
447  float y = xyzqi.y;
448  float z = xyzqi.z;
449  float q = xyzqi.w;
450 
451  float frx = ((float)nfftx)*x;
452  float fry = ((float)nffty)*y;
453  float frz = ((float)nfftz)*z;
454 
455  int frxi = (int)frx;
456  int fryi = (int)fry;
457  int frzi = (int)frz;
458 
459  float wx = frx - (float)frxi;
460  float wy = fry - (float)fryi;
461  float wz = frz - (float)frzi;
462 
463  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
464  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
465 
466  sh_ix[threadIdx.x] = frxi;
467  sh_iy[threadIdx.x] = fryi - y00;
468  sh_iz[threadIdx.x] = frzi - z00;
469 
470  float theta[order];
471 
472  calc_one_theta<float, order>(wx, theta);
473 #pragma unroll
474  for (int i=0;i < order;i++) sh_thetax[threadIdx.x*order + i] = q*theta[i];
475 
476  calc_one_theta<float, order>(wy, theta);
477 #pragma unroll
478  for (int i=0;i < order;i++) sh_thetay[threadIdx.x*order + i] = theta[i];
479 
480  calc_one_theta<float, order>(wz, theta);
481 #pragma unroll
482  for (int i=0;i < order;i++) sh_thetaz[threadIdx.x*order + i] = theta[i];
483 
484  }
485 
486  BLOCK_SYNC;
487 
488  // Grid point location, values of (x0, y0, z0) are in range 0..order-1
489  // NOTE: Only tid=0...order*order*order-1 do any computation
490  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
491  const int tid = (threadIdx.x + threadIdx.y*blockDim.x) % order3; // 0...order3-1
492  const int x0 = tid % order;
493  const int y0 = (tid / order) % order;
494  const int z0 = tid / (order*order);
495 
496  // Loop over atoms pos..pos_end-1
497  int iadd = blockDim.x*blockDim.y/order3;
498  int i = (threadIdx.x + threadIdx.y*blockDim.x)/order3;
499  int iend = pos_end - blockIdx.x*blockDim.x;
500  for (;i < iend;i += iadd) {
501  int x = sh_ix[i] + x0;
502  int y = sh_iy[i] + y0;
503  int z = sh_iz[i] + z0;
504 
505  if (x >= nfftx) x -= nfftx;
506 
507  if (periodicY && (y >= nffty)) y -= nffty;
508  if (!periodicY && (y < 0 || y >= ysize)) continue;
509 
510  if (periodicZ && (z >= nfftz)) z -= nfftz;
511  if (!periodicZ && (z < 0 || z >= zsize)) continue;
512 
513  // Get position on the grid
514  int ind = x + xdim*(y + ysize*(z));
515 
516  // Here we unroll the order x order x order loop with order3 threads;
517  // e.g. for order=6 the 216 grid points are covered by 7*32=224 threads, the excess masked out below
518  // Calculate interpolated charge value and store it to global memory
519 
520  if (tid < order*order*order)
521  write_grid<AT>(sh_thetax[i*order+x0]*sh_thetay[i*order+y0]*sh_thetaz[i*order+z0], ind, data);
522  }
523 
524 }
525 
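// The kernel above scales positions by the grid dimensions, i.e. xyzq is evidently
// expected in fractional (unit-cell) coordinates: an atom at fractional x touches
// `order` consecutive grid lines starting at floor(nfftx*x), with B-spline weights
// evaluated at the fractional remainder.  A minimal host-side restatement of that
// mapping (illustrative only, not NAMD API):
/*
static void grid_footprint(double x, int nfftx, int order,
                           int *first_ix, double *w) {
  // x is a fractional coordinate in [0,1)
  double frx = nfftx*x;
  *first_ix  = (int)frx;          // first grid line touched (wrapped modulo nfftx)
  *w         = frx - *first_ix;   // B-spline argument in [0,1)
  // grid lines touched: first_ix, first_ix+1, ..., first_ix+order-1 (mod nfftx)
}
*/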
526 //-----------------------------------------------------------------------------------------
527 // Generic version cannot be used; only the float and float3 specializations below store anything
528 template <typename T> __forceinline__ __device__
529 void gather_force_store(const float fx, const float fy, const float fz,
530  const int stride, const int pos,
531  T* force) {
532 }
533 
534 // Template specialization for "float"
535 template <> __forceinline__ __device__
536 void gather_force_store<float>(const float fx, const float fy, const float fz,
537  const int stride, const int pos,
538  float* force) {
539  // Store into strided float XYZ array (x, y, z planes separated by stride)
540 #ifdef NAMD_HIP
541  // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
542  // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
543  reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
544  reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
545  reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
546 #else
547  force[pos] = fx;
548  force[pos+stride] = fy;
549  force[pos+stride*2] = fz;
550 #endif
551 }
552 
553 // Template specialization for "float3"
554 template <> __forceinline__ __device__
555 void gather_force_store<float3>(const float fx, const float fy, const float fz,
556  const int stride, const int pos,
557  float3* force) {
558  // Store into non-strided "float3" array
559 #ifdef NAMD_HIP
560  // Workaround: unlike CUDA, HIP-hcc has sizeof(float3) != sizeof(CudaForce) (and == sizeof(float4))
561  // TODO-HIP: Remove when https://github.com/ROCm-Developer-Tools/HIP/issues/706 is fixed
562  reinterpret_cast<float*>(force)[pos * 3 + 0] = fx;
563  reinterpret_cast<float*>(force)[pos * 3 + 1] = fy;
564  reinterpret_cast<float*>(force)[pos * 3 + 2] = fz;
565 #else
566  force[pos].x = fx;
567  force[pos].y = fy;
568  force[pos].z = fz;
569 #endif
570 }
571 //-----------------------------------------------------------------------------------------
572 
573 // Per-atom data structure for the gather_force kernels
574 template <typename T, int order>
575 struct gather_t {
576  int ix;
577  int iy;
578  int iz;
579  float charge;
580  float thetax[order];
581  float thetay[order];
582  float thetaz[order];
583  float dthetax[order];
584  float dthetay[order];
585  float dthetaz[order];
586  float f1;
587  float f2;
588  float f3;
589 };
590 
591 //
592 // Gathers forces from the grid
593 // blockDim.x = Number of atoms each block loads
594 // blockDim.x*blockDim.y = Total number of threads per block
595 //
596 template <typename CT, typename FT, int order, bool periodicY, bool periodicZ>
597 __global__ void gather_force(const float4 *xyzq, const int ncoord,
598  const int nfftx, const int nffty, const int nfftz,
599  const int xsize, const int ysize, const int zsize,
600  const int xdim, const int y00, const int z00,
601  // const float recip1, const float recip2, const float recip3,
602  const float* data, // NOTE: data is used for loads when __CUDA_ARCH__ >= 350
603 #ifdef NAMD_CUDA
604  const cudaTextureObject_t gridTexObj,
605 #endif
606  const int stride,
607  FT *force) {
608 
609  const int tid = threadIdx.x + threadIdx.y*blockDim.x; // 0...63
610 
611  // Shared memory
612  __shared__ gather_t<CT, order> shmem[32];
613 
614  const int pos = blockIdx.x*blockDim.x + threadIdx.x;
615  const int pos_end = min((blockIdx.x+1)*blockDim.x, ncoord);
616 
617  // Load atom data into shared memory
618  if (pos < pos_end && threadIdx.y == 0) {
619 
620  float4 xyzqi = xyzq[pos];
621  float x = xyzqi.x;
622  float y = xyzqi.y;
623  float z = xyzqi.z;
624  float q = xyzqi.w;
625 
626  float frx = ((float)nfftx)*x;
627  float fry = ((float)nffty)*y;
628  float frz = ((float)nfftz)*z;
629 
630  int frxi = (int)frx;
631  int fryi = (int)fry;
632  int frzi = (int)frz;
633 
634  float wx = frx - (float)frxi;
635  float wy = fry - (float)fryi;
636  float wz = frz - (float)frzi;
637 
638  if (!periodicY && y00 == 0 && fryi >= ysize) fryi -= nffty;
639  if (!periodicZ && z00 == 0 && frzi >= zsize) frzi -= nfftz;
640 
641  shmem[threadIdx.x].ix = frxi;
642  shmem[threadIdx.x].iy = fryi - y00;
643  shmem[threadIdx.x].iz = frzi - z00;
644  shmem[threadIdx.x].charge = q;
645 
646  float3 theta_tmp[order];
647  float3 dtheta_tmp[order];
648  calc_theta_dtheta<float, float3, order>(wx, wy, wz, theta_tmp, dtheta_tmp);
649 
650 #pragma unroll
651  for (int i=0;i < order;i++) shmem[threadIdx.x].thetax[i] = theta_tmp[i].x;
652 
653 #pragma unroll
654  for (int i=0;i < order;i++) shmem[threadIdx.x].thetay[i] = theta_tmp[i].y;
655 
656 #pragma unroll
657  for (int i=0;i < order;i++) shmem[threadIdx.x].thetaz[i] = theta_tmp[i].z;
658 
659 #pragma unroll
660  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetax[i] = dtheta_tmp[i].x;
661 
662 #pragma unroll
663  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetay[i] = dtheta_tmp[i].y;
664 
665 #pragma unroll
666  for (int i=0;i < order;i++) shmem[threadIdx.x].dthetaz[i] = dtheta_tmp[i].z;
667 
668  }
669  BLOCK_SYNC;
670 
671  // We divide the order x order x order cube into 8 sub-cubes.
672  // Each sub-cube is handled by a single thread
673  // The size of the sub-cubes is:
674  // order=4 : 2x2x2
675  // order=6 : 3x3x3
676  // order=8 : 4x4x4
677  const int nsc = (order == 4) ? 2 : ((order == 6) ? 3 : 4);
678  // Calculate the starting index on the sub-cube for this thread
679  // tid = 0...63
680  const int t = (tid % 8); // sub-cube index (0...7)
681  // t = (tx0 + ty0*2 + tz0*4)/nsc
682  // (tx0, ty0, tz0) gives the starting index of the nsc x nsc x nsc sub-cube
683  const int tz0 = (t / 4)*nsc;
684  const int ty0 = ((t / 2) % 2)*nsc;
685  const int tx0 = (t % 2)*nsc;
686 
687  //
688  // Calculate forces for 32 atoms. We have 32*2 = 64 threads
689  // Loop is iterated 4 times:
690  // (iterations)
691  // Threads 0...7 = atoms 0, 8, 16, 24
692  // Threads 8...15 = atoms 1, 9, 17, 25
693  // Threads 16...23 = atoms 2, 10, 18, 26
694  // ...
695  // Threads 56...63 = atoms 7, 15, 23, 31
696  //
697 
698  int base = tid/8;
699  const int base_end = pos_end - blockIdx.x*blockDim.x;
700 
701  // Make sure to mask out any threads that are not running the loop.
702  // This will happen if the number of atoms is not a multiple of 32.
703  WarpMask warp_mask = WARP_BALLOT(WARP_FULL_MASK, (base < base_end) );
704 
705  while (base < base_end) {
706 
707  float f1 = 0.0f;
708  float f2 = 0.0f;
709  float f3 = 0.0f;
710  int ix0 = shmem[base].ix;
711  int iy0 = shmem[base].iy;
712  int iz0 = shmem[base].iz;
713 
714  // Each thread calculates a nsc x nsc x nsc sub-cube
715 #pragma unroll
716  for (int i=0;i < nsc*nsc*nsc;i++) {
717  int tz = tz0 + (i/(nsc*nsc));
718  int ty = ty0 + ((i/nsc) % nsc);
719  int tx = tx0 + (i % nsc);
720 
721  int ix = ix0 + tx;
722  int iy = iy0 + ty;
723  int iz = iz0 + tz;
724  if (ix >= nfftx) ix -= nfftx;
725 
726  if (periodicY && (iy >= nffty)) iy -= nffty;
727  if (!periodicY && (iy < 0 || iy >= ysize)) continue;
728 
729  if (periodicZ && (iz >= nfftz)) iz -= nfftz;
730  if (!periodicZ && (iz < 0 || iz >= zsize)) continue;
731 
732  int ind = ix + (iy + iz*ysize)*xdim;
733 
734 #if __CUDA_ARCH__ >= 350 || defined(NAMD_HIP)
735  float q0 = __ldg(&data[ind]);
736 #else
737  float q0 = tex1Dfetch<float>(gridTexObj, ind);
738 #endif
739  float thx0 = shmem[base].thetax[tx];
740  float thy0 = shmem[base].thetay[ty];
741  float thz0 = shmem[base].thetaz[tz];
742  float dthx0 = shmem[base].dthetax[tx];
743  float dthy0 = shmem[base].dthetay[ty];
744  float dthz0 = shmem[base].dthetaz[tz];
745  f1 += dthx0 * thy0 * thz0 * q0;
746  f2 += thx0 * dthy0 * thz0 * q0;
747  f3 += thx0 * thy0 * dthz0 * q0;
748  }
749 
750  //-------------------------
751 
752  // Reduce
753  const int i = threadIdx.x & 7;
754 
755  f1 += WARP_SHUFFLE(warp_mask, f1, i+4, 8);
756  f2 += WARP_SHUFFLE(warp_mask, f2, i+4, 8);
757  f3 += WARP_SHUFFLE(warp_mask, f3, i+4, 8);
758 
759  f1 += WARP_SHUFFLE(warp_mask, f1, i+2, 8);
760  f2 += WARP_SHUFFLE(warp_mask, f2, i+2, 8);
761  f3 += WARP_SHUFFLE(warp_mask, f3, i+2, 8);
762 
763  f1 += WARP_SHUFFLE(warp_mask, f1, i+1, 8);
764  f2 += WARP_SHUFFLE(warp_mask, f2, i+1, 8);
765  f3 += WARP_SHUFFLE(warp_mask, f3, i+1, 8);
766 
767  if (i == 0) {
768  shmem[base].f1 = f1;
769  shmem[base].f2 = f2;
770  shmem[base].f3 = f3;
771  }
772 
773  base += 8;
774  warp_mask = WARP_BALLOT(warp_mask, (base < base_end) );
775  }
776 
777  // Write forces
778  BLOCK_SYNC;
779  if (pos < pos_end && threadIdx.y == 0) {
780  float f1 = shmem[threadIdx.x].f1;
781  float f2 = shmem[threadIdx.x].f2;
782  float f3 = shmem[threadIdx.x].f3;
783  float q = -shmem[threadIdx.x].charge; //*ccelec_float;
784  // float fx = q*recip1*f1*nfftx;
785  // float fy = q*recip2*f2*nffty;
786  // float fz = q*recip3*f3*nfftz;
787  float fx = q*f1*nfftx;
788  float fy = q*f2*nffty;
789  float fz = q*f3*nfftz;
790  gather_force_store<FT>(fx, fy, fz, stride, pos, force);
791  }
792 
793 }
794 
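// Worked example of the sub-cube decomposition used in the kernel above, for
// order = 4 (nsc = 2): each group of 8 threads covers one atom's 4x4x4 stencil,
// thread t = tid%8 taking the 2x2x2 corner that starts at
//   t : (tx0,ty0,tz0) = 0:(0,0,0)  1:(2,0,0)  2:(0,2,0)  3:(2,2,0)
//                       4:(0,0,2)  5:(2,0,2)  6:(0,2,2)  7:(2,2,2)
// and the three 8-lane WARP_SHUFFLE steps then sum the 8 partial (f1,f2,f3).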
795 const int TILEDIM = 32;
796 const int TILEROWS = 8;
797 
798 template <typename T>
799 __device__ __forceinline__
800 void transpose_xyz_yzx_device(
801  const int x_in, const int y_in, const int z_in,
802  const int x_out, const int y_out,
803  const int nx, const int ny, const int nz,
804  const int xsize_in, const int ysize_in,
805  const int ysize_out, const int zsize_out,
806  const T* data_in, T* data_out) {
807 
808  // Shared memory
809  __shared__ T tile[TILEDIM][TILEDIM+1];
810 
811  // Read (x,y) data_in into tile (shared memory)
812  for (int j=0;j < TILEDIM;j += TILEROWS)
813  if ((x_in < nx) && (y_in + j < ny) && (z_in < nz))
814  tile[threadIdx.y + j][threadIdx.x] = data_in[x_in + (y_in + j + z_in*ysize_in)*xsize_in];
815 
816  BLOCK_SYNC;
817 
818  // Write (y,x) tile into data_out
819  const int z_out = z_in;
820  for (int j=0;j < TILEDIM;j += TILEROWS)
821  if ((x_out + j < nx) && (y_out < ny) && (z_out < nz))
822  data_out[y_out + (z_out + (x_out+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
823 }
824 
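// Host reference for the index mapping implemented above: data_in is x-fastest
// (x, then y, then z) and data_out is y-fastest (y, then z, then x).  The extra
// +1 column in `tile` is the usual padding that avoids shared-memory bank
// conflicts when the tile is read back transposed.  Illustrative sketch only:
/*
template <typename T>
void transpose_xyz_yzx_host(int nx, int ny, int nz,
                            int xsize_in, int ysize_in,
                            int ysize_out, int zsize_out,
                            const T* data_in, T* data_out) {
  for (int z = 0; z < nz; z++)
    for (int y = 0; y < ny; y++)
      for (int x = 0; x < nx; x++)
        data_out[y + (z + x*zsize_out)*ysize_out] =
            data_in[x + (y + z*ysize_in)*xsize_in];
}
*/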
825 //
826 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
827 //
828 template <typename T>
829 __global__ void transpose_xyz_yzx_kernel(
830  const int nx, const int ny, const int nz,
831  const int xsize_in, const int ysize_in,
832  const int ysize_out, const int zsize_out,
833  const T* data_in, T* data_out) {
834 
835  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
836  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
837  int z_in = blockIdx.z + threadIdx.z;
838 
839  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
840  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
841 
842  transpose_xyz_yzx_device<T>(
843  x_in, y_in, z_in,
844  x_out, y_out,
845  nx, ny, nz,
846  xsize_in, ysize_in,
847  ysize_out, zsize_out,
848  data_in, data_out);
849 
850 /*
851  // Shared memory
852  __shared__ T tile[TILEDIM][TILEDIM+1];
853 
854  int x = blockIdx.x * TILEDIM + threadIdx.x;
855  int y = blockIdx.y * TILEDIM + threadIdx.y;
856  int z = blockIdx.z + threadIdx.z;
857 
858  // Read (x,y) data_in into tile (shared memory)
859  for (int j=0;j < TILEDIM;j += TILEROWS)
860  if ((x < nx) && (y + j < ny) && (z < nz))
861  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
862 
863  BLOCK_SYNC;
864 
865  // Write (y,x) tile into data_out
866  x = blockIdx.x * TILEDIM + threadIdx.y;
867  y = blockIdx.y * TILEDIM + threadIdx.x;
868  for (int j=0;j < TILEDIM;j += TILEROWS)
869  if ((x + j < nx) && (y < ny) && (z < nz))
870  data_out[y + (z + (x+j)*zsize_out)*ysize_out] = tile[threadIdx.x][threadIdx.y + j];
871 */
872 }
873 
874 //
875 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(y, z, x)
876 // Batch index bi is encoded in blockIdx.z, where
877 // blockIdx.z = 0...nz-1 are for batch 1
878 // blockIdx.z = nz...2*nz-1 are for batch 2
879 // ...
880 // gridDim.z = nz*numBatches
881 //
882 template <typename T>
883 __global__ void batchTranspose_xyz_yzx_kernel(
884  const TransposeBatch<T>* batches,
885  const int ny, const int nz,
886  const int xsize_in, const int ysize_in) {
887 
888  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
889  int y_in = blockIdx.y * TILEDIM + threadIdx.y;
890  int z_in = (blockIdx.z % nz) + threadIdx.z;
891 
892  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
893  int y_out = blockIdx.y * TILEDIM + threadIdx.x;
894 
895  int bi = blockIdx.z/nz;
896 
897  TransposeBatch<T> batch = batches[bi];
898  int nx = batch.nx;
899  int ysize_out = batch.ysize_out;
900  int zsize_out = batch.zsize_out;
901  T* data_in = batch.data_in;
902  T* data_out = batch.data_out;
903 
904  transpose_xyz_yzx_device<T>(
905  x_in, y_in, z_in,
906  x_out, y_out,
907  nx, ny, nz,
908  xsize_in, ysize_in,
909  ysize_out, zsize_out,
910  data_in, data_out);
911 
912 }
913 
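// Worked example of the batch encoding described above: with numBatches = 3 and
// nz = 16 the launcher batchTranspose_xyz_yzx() below uses gridDim.z = 48, so
// blockIdx.z = 35 decodes to batch bi = 35/16 = 2 and z-slice z_in = 35%16 = 3.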
914 /*
915 //
916 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
917 //
918 template <typename T>
919 __forceinline__ __device__
920 void transpose_xyz_yzx_dev(
921  const int blockz,
922  const int nx, const int ny, const int nz,
923  const int xsize_in, const int ysize_in,
924  const int xsize_out, const int ysize_out,
925  const T* data_in, T* data_out) {
926 
927  // Shared memory
928  __shared__ T tile[TILEDIM][TILEDIM+1];
929 
930  int x = blockIdx.x * TILEDIM + threadIdx.x;
931  int y = blockIdx.y * TILEDIM + threadIdx.y;
932  // int z = blockIdx.z + threadIdx.z;
933  int z = blockz + threadIdx.z;
934 
935  // Read (x,y) data_in into tile (shared memory)
936  for (int j=0;j < TILEDIM;j += TILEROWS)
937  if ((x < nx) && (y + j < ny) && (z < nz))
938  tile[threadIdx.y + j][threadIdx.x] = data_in[x + (y + j + z*ysize_in)*xsize_in];
939 
940  BLOCK_SYNC;
941 
942  // Write (y,x) tile into data_out
943  x = blockIdx.x * TILEDIM + threadIdx.y;
944  y = blockIdx.y * TILEDIM + threadIdx.x;
945  for (int j=0;j < TILEDIM;j += TILEROWS)
946  if ((x + j < nx) && (y < ny) && (z < nz))
947  data_out[y + (z + (x+j)*ysize_out)*xsize_out] = tile[threadIdx.x][threadIdx.y + j];
948 
949 }
950 
951 //
952 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(y, z, x)
953 // (nx, ny, nz) = size of the transposed volume
954 // (xsize_in, ysize_in, zsize_in) = size of the input data
955 // into nblock memory blocks
956 //
957 template <typename T>
958 __global__ void transpose_xyz_yzx_kernel(
959  const int nblock,
960  const int nx, const int ny, const int nz,
961  const int xsize_in, const int ysize_in,
962  const int xsize_out, const int ysize_out,
963  const T* data_in, T* data_out) {
964 
965  const int iblock = blockIdx.z/nz;
966 
967  if (iblock < nblock) {
968  transpose_xyz_yzx_dev(blockIdx.z % nz, nx, ny, nz,
969  xsize_in, ysize_in, xsize_out, ysize_out,
970  data_in, data_out);
971  }
972 
973 }
974 */
975 
976 template <typename T>
977 __device__ __forceinline__
978 void transpose_xyz_zxy_device(
979  const int x_in, const int y_in, const int z_in,
980  const int x_out, const int z_out,
981  const int nx, const int ny, const int nz,
982  const int xsize_in, const int ysize_in,
983  const int zsize_out, const int xsize_out,
984  const T* data_in, T* data_out) {
985 
986  // Shared memory
987  __shared__ T tile[TILEDIM][TILEDIM+1];
988 
989  // Read (x,z) data_in into tile (shared memory)
990  for (int k=0;k < TILEDIM;k += TILEROWS)
991  if ((x_in < nx) && (y_in < ny) && (z_in + k < nz))
992  tile[threadIdx.y + k][threadIdx.x] = data_in[x_in + (y_in + (z_in + k)*ysize_in)*xsize_in];
993 
994  BLOCK_SYNC;
995 
996  // Write (z,x) tile into data_out
997  const int y_out = y_in;
998  for (int k=0;k < TILEDIM;k += TILEROWS)
999  if ((x_out + k < nx) && (y_out < ny) && (z_out < nz))
1000  data_out[z_out + (x_out + k + y_out*xsize_out)*zsize_out] = tile[threadIdx.x][threadIdx.y + k];
1001 }
1002 
1003 //
1004 // Transposes a 3d matrix out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1005 //
1006 template <typename T>
1007 __global__ void transpose_xyz_zxy_kernel(
1008  const int nx, const int ny, const int nz,
1009  const int xsize_in, const int ysize_in,
1010  const int zsize_out, const int xsize_out,
1011  const T* data_in, T* data_out) {
1012 
1013  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1014  int y_in = blockIdx.z + threadIdx.z;
1015  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1016 
1017  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1018  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1019 
1020  transpose_xyz_zxy_device<T>(
1021  x_in, y_in, z_in, x_out, z_out,
1022  nx, ny, nz,
1023  xsize_in, ysize_in,
1024  zsize_out, xsize_out,
1025  data_in, data_out);
1026 
1027 }
1028 
1029 //
1030 // Transposes a batch of 3d matrices out-of-place: data_in(x, y, z) -> data_out(z, x, y)
1031 // Batch index bi is encoded in blockIdx.z, where
1032 // blockIdx.z = 0...ny-1 are for batch 1
1033 // blockIdx.z = ny...2*ny-1 are for batch 2
1034 // ...
1035 // gridDim.z = ny*numBatches
1036 //
1037 template <typename T>
1038 __global__ void batchTranspose_xyz_zxy_kernel(
1039  const TransposeBatch<T>* batches,
1040  const int ny, const int nz,
1041  const int xsize_in, const int ysize_in) {
1042 
1043  int x_in = blockIdx.x * TILEDIM + threadIdx.x;
1044  int y_in = (blockIdx.z % ny) + threadIdx.z;
1045  int z_in = blockIdx.y * TILEDIM + threadIdx.y;
1046 
1047  int x_out = blockIdx.x * TILEDIM + threadIdx.y;
1048  int z_out = blockIdx.y * TILEDIM + threadIdx.x;
1049 
1050  int bi = blockIdx.z/ny;
1051 
1052  TransposeBatch<T> batch = batches[bi];
1053  int nx = batch.nx;
1054  int zsize_out = batch.zsize_out;
1055  int xsize_out = batch.xsize_out;
1056  T* data_in = batch.data_in;
1057  T* data_out = batch.data_out;
1058 
1059  transpose_xyz_zxy_device<T>(
1060  x_in, y_in, z_in, x_out, z_out,
1061  nx, ny, nz,
1062  xsize_in, ysize_in,
1063  zsize_out, xsize_out,
1064  data_in, data_out);
1065 
1066 }
1067 
1068 //#######################################################################################
1069 //#######################################################################################
1070 //#######################################################################################
1071 
1072 void spread_charge(const float4 *atoms, const int numAtoms,
1073  const int nfftx, const int nffty, const int nfftz,
1074  const int xsize, const int ysize, const int zsize,
1075  const int xdim, const int y00, const int z00,
1076  const bool periodicY, const bool periodicZ,
1077  float* data, const int order, cudaStream_t stream) {
1078 
1079  dim3 nthread, nblock;
1080  const int order3 = ((order*order*order-1)/WARPSIZE + 1)*WARPSIZE;
1081  switch(order) {
1082  case 4:
1083  nthread.x = WARPSIZE;
1084 #if defined(NAMD_CUDA)
1085  nthread.y = 4;
1086 #else
1087  nthread.y = order3 / WARPSIZE;
1088 #endif
1089  nthread.z = 1;
1090  nblock.x = (numAtoms - 1)/nthread.x + 1;
1091  nblock.y = 1;
1092  nblock.z = 1;
1093  if (periodicY && periodicZ)
1094  spread_charge_kernel<float, 4, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1095  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1096  else if (periodicY)
1097  spread_charge_kernel<float, 4, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1098  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1099  else if (periodicZ)
1100  spread_charge_kernel<float, 4, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1101  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1102  else
1103  spread_charge_kernel<float, 4, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1104  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1105  break;
1106 
1107  case 6:
1108  nthread.x = WARPSIZE;
1109  nthread.y = order3/WARPSIZE;
1110  nthread.z = 1;
1111  nblock.x = (numAtoms - 1)/nthread.x + 1;
1112  nblock.y = 1;
1113  nblock.z = 1;
1114  if (periodicY && periodicZ)
1115  spread_charge_kernel<float, 6, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1116  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1117  else if (periodicY)
1118  spread_charge_kernel<float, 6, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1119  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1120  else if (periodicZ)
1121  spread_charge_kernel<float, 6, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1122  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1123  else
1124  spread_charge_kernel<float, 6, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1125  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1126  break;
1127 
1128  case 8:
1129  nthread.x = WARPSIZE;
1130  nthread.y = order3/WARPSIZE;
1131  nthread.z = 1;
1132  nblock.x = (numAtoms - 1)/nthread.x + 1;
1133  nblock.y = 1;
1134  nblock.z = 1;
1135  if (periodicY && periodicZ)
1136  spread_charge_kernel<float, 8, true, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1137  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1138  else if (periodicY)
1139  spread_charge_kernel<float, 8, true, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1140  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1141  else if (periodicZ)
1142  spread_charge_kernel<float, 8, false, true> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1143  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1144  else
1145  spread_charge_kernel<float, 8, false, false> <<< nblock, nthread, 0, stream >>> (atoms, numAtoms,
1146  nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00, data);
1147  break;
1148 
1149  default:
1150  char str[128];
1151  sprintf(str, "spread_charge, order %d not implemented",order);
1152  cudaNAMD_bug(str);
1153  }
1154  cudaCheck(cudaGetLastError());
1155 
1156 }
1157 
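// Minimal usage sketch for the launcher above; the buffer names, sizes and the
// assumption that the destination grid is cleared beforehand are illustrative,
// not NAMD API.  Fully periodic, non-decomposed grid, 4th-order spline:
/*
const int numAtoms = 1000;
const int nfftx = 64, nffty = 64, nfftz = 64;
const int xdim  = 64;              // padded x stride of the grid buffer
float4* d_atoms;                   // numAtoms entries, xyz fractional, w = charge
float*  d_grid;                    // xdim*nffty*nfftz floats, presumably zeroed
cudaStream_t stream;
// ... allocate and fill the buffers, create the stream ...
spread_charge(d_atoms, numAtoms,
              nfftx, nffty, nfftz,
              nfftx, nffty, nfftz,   // xsize, ysize, zsize = full grid
              xdim, 0, 0,            // y00 = z00 = 0 (single pencil)
              true, true,            // periodicY, periodicZ
              d_grid, 4, stream);
*/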
1158 void scalar_sum(const bool orderXYZ, const int nfft1, const int nfft2, const int nfft3,
1159  const int size1, const int size2, const int size3, const double kappa,
1160  const float recip1x, const float recip1y, const float recip1z,
1161  const float recip2x, const float recip2y, const float recip2z,
1162  const float recip3x, const float recip3y, const float recip3z,
1163  const double volume,
1164  const float* prefac1, const float* prefac2, const float* prefac3,
1165  const int k2_00, const int k3_00,
1166  const bool doEnergyVirial, double* energy, double* virial, float2* data,
1167  cudaStream_t stream) {
1168 #ifdef NAMD_CUDA
1169  int nthread = 1024;
1170  int nblock = 64;
1171 #else
1172  int nthread = 256;
1173  int nblock = 256;
1174 #endif
1175 
1176  int shmem_size = sizeof(float)*(nfft1 + nfft2 + nfft3);
1177  if (doEnergyVirial) {
1178  shmem_size = max(shmem_size, (int)((nthread/WARPSIZE)*sizeof(RecipVirial_t)));
1179  }
1180 
1181  float piv_inv = (float)(1.0/(M_PI*volume));
1182  float fac = (float)(M_PI*M_PI/(kappa*kappa));
1183 
1184  if (doEnergyVirial) {
1185  if (orderXYZ) {
1186  scalar_sum_kernel<float, float2, true, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1187  nfft1, nfft2, nfft3, size1, size2, size3,
1188  recip1x, recip1y, recip1z,
1189  recip2x, recip2y, recip2z,
1190  recip3x, recip3y, recip3z,
1191  prefac1, prefac2, prefac3,
1192  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1193  } else {
1194  scalar_sum_kernel<float, float2, true, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1195  nfft1, nfft2, nfft3, size1, size2, size3,
1196  recip1x, recip1y, recip1z,
1197  recip2x, recip2y, recip2z,
1198  recip3x, recip3y, recip3z,
1199  prefac1, prefac2, prefac3,
1200  fac, piv_inv, k2_00, k3_00, data, energy, virial);
1201  }
1202  } else {
1203  if (orderXYZ) {
1204  scalar_sum_kernel<float, float2, false, true, false> <<< nblock, nthread, shmem_size, stream >>> (
1205  nfft1, nfft2, nfft3, size1, size2, size3,
1206  recip1x, recip1y, recip1z,
1207  recip2x, recip2y, recip2z,
1208  recip3x, recip3y, recip3z,
1209  prefac1, prefac2, prefac3,
1210  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1211  } else {
1212  scalar_sum_kernel<float, float2, false, false, false> <<< nblock, nthread, shmem_size, stream >>> (
1213  nfft1, nfft2, nfft3, size1, size2, size3,
1214  recip1x, recip1y, recip1z,
1215  recip2x, recip2y, recip2z,
1216  recip3x, recip3y, recip3z,
1217  prefac1, prefac2, prefac3,
1218  fac, piv_inv, k2_00, k3_00, data, NULL, NULL);
1219  }
1220  }
1221  cudaCheck(cudaGetLastError());
1222 
1223 }
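// Shared-memory request of the launch above, worked through for the CUDA branch
// (nthread = 1024; WARPSIZE assumed to be 32 here) and a hypothetical 96x96x96 grid:
//   prefac staging : sizeof(float)*(96+96+96)        = 1152 bytes
//   reductions     : (1024/32)*sizeof(RecipVirial_t) = 32*56 = 1792 bytes
// so with doEnergyVirial the kernel gets 1792 bytes of dynamic shared memory,
// and 1152 bytes otherwise.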
1224 
1225 void gather_force(const float4 *atoms, const int numAtoms,
1226  // const float recip11, const float recip22, const float recip33,
1227  const int nfftx, const int nffty, const int nfftz,
1228  const int xsize, const int ysize, const int zsize,
1229  const int xdim, const int y00, const int z00,
1230  const bool periodicY, const bool periodicZ,
1231  const float* data, const int order, float3* force,
1232 #ifdef NAMD_CUDA
1233  const cudaTextureObject_t gridTexObj,
1234 #endif
1235  cudaStream_t stream) {
1236 
1237  dim3 nthread(32, 2, 1);
1238  dim3 nblock((numAtoms - 1)/nthread.x + 1, 1, 1);
1239  // dim3 nblock(npatch, 1, 1);
1240 
1241  switch(order) {
1242  case 4:
1243  if (periodicY && periodicZ)
1244  gather_force<float, float3, 4, true, true> <<< nblock, nthread, 0, stream >>> (
1245  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1246  // recip11, recip22, recip33,
1247  data,
1248 #ifdef NAMD_CUDA
1249  gridTexObj,
1250 #endif
1251  1, force);
1252  else if (periodicY)
1253  gather_force<float, float3, 4, true, false> <<< nblock, nthread, 0, stream >>> (
1254  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1255  // recip11, recip22, recip33,
1256  data,
1257 #ifdef NAMD_CUDA
1258  gridTexObj,
1259 #endif
1260  1, force);
1261  else if (periodicZ)
1262  gather_force<float, float3, 4, false, true> <<< nblock, nthread, 0, stream >>> (
1263  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1264  // recip11, recip22, recip33,
1265  data,
1266 #ifdef NAMD_CUDA
1267  gridTexObj,
1268 #endif
1269  1, force);
1270  else
1271  gather_force<float, float3, 4, false, false> <<< nblock, nthread, 0, stream >>> (
1272  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1273  // recip11, recip22, recip33,
1274  data,
1275 #ifdef NAMD_CUDA
1276  gridTexObj,
1277 #endif
1278  1, force);
1279  break;
1280 
1281  case 6:
1282  if (periodicY && periodicZ)
1283  gather_force<float, float3, 6, true, true> <<< nblock, nthread, 0, stream >>> (
1284  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1285  // recip11, recip22, recip33,
1286  data,
1287 #ifdef NAMD_CUDA
1288  gridTexObj,
1289 #endif
1290  1, force);
1291  else if (periodicY)
1292  gather_force<float, float3, 6, true, false> <<< nblock, nthread, 0, stream >>> (
1293  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1294  // recip11, recip22, recip33,
1295  data,
1296 #ifdef NAMD_CUDA
1297  gridTexObj,
1298 #endif
1299  1, force);
1300  else if (periodicZ)
1301  gather_force<float, float3, 6, false, true> <<< nblock, nthread, 0, stream >>> (
1302  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1303  // recip11, recip22, recip33,
1304  data,
1305 #ifdef NAMD_CUDA
1306  gridTexObj,
1307 #endif
1308  1, force);
1309  else
1310  gather_force<float, float3, 6, false, false> <<< nblock, nthread, 0, stream >>> (
1311  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1312  // recip11, recip22, recip33,
1313  data,
1314 #ifdef NAMD_CUDA
1315  gridTexObj,
1316 #endif
1317  1, force);
1318  break;
1319 
1320  case 8:
1321  if (periodicY && periodicZ)
1322  gather_force<float, float3, 8, true, true> <<< nblock, nthread, 0, stream >>> (
1323  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1324  // recip11, recip22, recip33,
1325  data,
1326 #ifdef NAMD_CUDA
1327  gridTexObj,
1328 #endif
1329  1, force);
1330  else if (periodicY)
1331  gather_force<float, float3, 8, true, false> <<< nblock, nthread, 0, stream >>> (
1332  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1333  // recip11, recip22, recip33,
1334  data,
1335 #ifdef NAMD_CUDA
1336  gridTexObj,
1337 #endif
1338  1, force);
1339  else if (periodicZ)
1340  gather_force<float, float3, 8, false, true> <<< nblock, nthread, 0, stream >>> (
1341  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1342  // recip11, recip22, recip33,
1343  data,
1344 #ifdef NAMD_CUDA
1345  gridTexObj,
1346 #endif
1347  1, force);
1348  else
1349  gather_force<float, float3, 8, false, false> <<< nblock, nthread, 0, stream >>> (
1350  atoms, numAtoms, nfftx, nffty, nfftz, xsize, ysize, zsize, xdim, y00, z00,
1351  // recip11, recip22, recip33,
1352  data,
1353 #ifdef NAMD_CUDA
1354  gridTexObj,
1355 #endif
1356  1, force);
1357  break;
1358 
1359  default:
1360  char str[128];
1361  sprintf(str, "gather_force, order %d not implemented",order);
1362  cudaNAMD_bug(str);
1363  }
1364  cudaCheck(cudaGetLastError());
1365 
1366 }
1367 
1368 //
1369 // Transpose
1370 //
1371 void transpose_xyz_yzx(
1372  const int nx, const int ny, const int nz,
1373  const int xsize_in, const int ysize_in,
1374  const int ysize_out, const int zsize_out,
1375  const float2* data_in, float2* data_out, cudaStream_t stream) {
1376 
1377  dim3 numthread(TILEDIM, TILEROWS, 1);
1378  dim3 numblock((nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz);
1379  transpose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1380  nx, ny, nz, xsize_in, ysize_in,
1381  ysize_out, zsize_out,
1382  data_in, data_out);
1383 
1384  cudaCheck(cudaGetLastError());
1385 }
1386 
1387 //
1388 // Batched transpose
1389 //
1390 void batchTranspose_xyz_yzx(
1391  const int numBatches, TransposeBatch<float2>* batches,
1392  const int max_nx, const int ny, const int nz,
1393  const int xsize_in, const int ysize_in, cudaStream_t stream) {
1394 
1395  dim3 numthread(TILEDIM, TILEROWS, 1);
1396  dim3 numblock((max_nx-1)/TILEDIM+1, (ny-1)/TILEDIM+1, nz*numBatches);
1397 
1398  batchTranspose_xyz_yzx_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
1399 
1400  cudaCheck(cudaGetLastError());
1401 }
1402 
1403 //
1404 // Transpose
1405 //
1406 void transpose_xyz_zxy(
1407  const int nx, const int ny, const int nz,
1408  const int xsize_in, const int ysize_in,
1409  const int zsize_out, const int xsize_out,
1410  const float2* data_in, float2* data_out, cudaStream_t stream) {
1411 
1412  dim3 numthread(TILEDIM, TILEROWS, 1);
1413  dim3 numblock((nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny);
1414  transpose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (
1415  nx, ny, nz, xsize_in, ysize_in,
1416  zsize_out, xsize_out,
1417  data_in, data_out);
1418 
1419  cudaCheck(cudaGetLastError());
1420 }
1421 
1422 //
1423 // Batched transpose
1424 //
1425 void batchTranspose_xyz_zxy(
1426  const int numBatches, TransposeBatch<float2>* batches,
1427  const int max_nx, const int ny, const int nz,
1428  const int xsize_in, const int ysize_in,
1429  cudaStream_t stream) {
1430 
1431  dim3 numthread(TILEDIM, TILEROWS, 1);
1432  dim3 numblock((max_nx-1)/TILEDIM+1, (nz-1)/TILEDIM+1, ny*numBatches);
1433 
1434  batchTranspose_xyz_zxy_kernel<float2> <<< numblock, numthread, 0, stream >>> (batches, ny, nz, xsize_in, ysize_in);
1435 
1436  cudaCheck(cudaGetLastError());
1437 }
1438 
1439 #endif // NAMD_CUDA || NAMD_HIP