CudaComputeNonbondedKernel.cu
1 #ifdef NAMD_CUDA
2 #include <cuda.h>
3 #if __CUDACC_VER_MAJOR__ >= 11
4 #include <cub/cub.cuh>
5 #else
6 #include <namd_cub/cub.cuh>
7 #endif //CUDACC version
8 #endif //NAMD_CUDA
9 #ifdef NAMD_HIP //NAMD_HIP
10 #include <hip/hip_runtime.h>
11 #include <hipcub/hipcub.hpp>
12 #define cub hipcub
13 #include "HipDefines.h"
14 #endif
15 #include "CudaComputeNonbondedKernel.h"
16 #include "CudaTileListKernel.h"
17 #include "DeviceCUDA.h"
18 #ifdef WIN32
19 #define __thread __declspec(thread)
20 #endif
21 extern __thread DeviceCUDA *deviceCUDA;
22 
23 #ifdef NAMD_CUDA //Handles NVIDIA
24 #define NONBONDKERNEL_NUM_WARP 4
25 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
26 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
27 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
28 #else // Handles AMD
29 #define NONBONDKERNEL_NUM_WARP 1
30 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 4
31 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 4
32 #define REDUCEGBISENERGYKERNEL_NUM_WARP 4
33 #define __ldg *
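// On this (HIP/AMD) path __ldg() degrades to a plain pointer dereference, since the
// CUDA read-only-cache intrinsic is not available; the __ldg() call sites below then
// compile to ordinary global loads.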
34 #endif
35 #define OVERALLOC 1.2f
36 
37 void NAMD_die(const char *);
38 
39 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
40 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
41 
42 
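// sampleTableTex: manual linear interpolation between two point-sampled texels
// (tex1Dfetch) of a force/energy table; the coordinate k is assumed to be normalized
// to [0,1) over the FORCE_ENERGY_TABLE_SIZE entries.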
43 __device__ __forceinline__
44 float4 sampleTableTex(cudaTextureObject_t tex, float k) {
45  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
46  const float x = k * (float)tableSize - 0.5f;
47  const float f = floorf(x);
48  const float a = x - f;
49  const unsigned int i = (unsigned int)f;
50  const int i0 = i < tableSize - 1 ? i : tableSize - 1;
51  const int i1 = i0 + 1;
52  const float4 t0 = tex1Dfetch<float4>(tex, i0);
53  const float4 t1 = tex1Dfetch<float4>(tex, i1);
54  return make_float4(
55  a * (t1.x - t0.x) + t0.x,
56  a * (t1.y - t0.y) + t0.y,
57  a * (t1.z - t0.z) + t0.z,
58  a * (t1.w - t0.w) + t0.w);
59 }
60 
61 #ifndef NAMD_CUDA
62 
63 // HIP implementation of tex1D has lower performance than this custom implementation of
64 // linear filtering
65 
66 // #define USE_TABLE_ARRAYS
67 
68 __device__ __forceinline__
69 float4 tableLookup(const float4* table, const float k)
70 {
71  const int tableSize = FORCE_ENERGY_TABLE_SIZE;
72  const float x = k * static_cast<float>(tableSize) - 0.5f;
73  const float f = floorf(x);
74  const float a = x - f;
75  const int i = static_cast<int>(f);
76  const int i0 = max(0, min(tableSize - 1, i));
77  const int i1 = max(0, min(tableSize - 1, i + 1));
78  const float4 t0 = __ldg(&table[i0]);
79  const float4 t1 = __ldg(&table[i1]);
80  return make_float4(
81  a * (t1.x - t0.x) + t0.x,
82  a * (t1.y - t0.y) + t0.y,
83  a * (t1.z - t0.z) + t0.z,
84  a * (t1.w - t0.w) + t0.w);
85 }
86 
87 #endif
88 
89 template<bool doEnergy, bool doSlow>
90 __device__ __forceinline__
91 void calcForceEnergy(const float r2, const float qi, const float qj,
92  const float dx, const float dy, const float dz,
93  const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
94 #ifdef USE_TABLE_ARRAYS
95  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
96 #else
97  cudaTextureObject_t vdwCoefTableTex,
98  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
99 #endif
100  float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
101  float& energyVdw, float& energyElec, float& energySlow) {
102 
103  int vdwIndex = vdwtypej + vdwtypei;
104 #if __CUDA_ARCH__ >= 350
105  float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
106 #else
107  float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
108 #endif
109 
110  float rinv = __frsqrt_rn(r2);
111  float4 ei;
112 
113 #ifdef NAMD_HIP
114  float4 fi = sampleTableTex(forceTableTex, rinv);
115  if (doEnergy) ei = sampleTableTex(energyTableTex, rinv);
116 #else
117  float4 fi = tex1D<float4>(forceTableTex, rinv);
118  if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
119 #endif
120 
121  float fSlow = qi * qj;
122  float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
123 
124  if (doEnergy) {
125  energyVdw += ljab.x * ei.z + ljab.y * ei.y;
126  energyElec += fSlow * ei.x;
127  if (doSlow) energySlow += fSlow * ei.w;
128  }
129  if (doSlow) fSlow *= fi.w;
130 
131  float fx = dx * f;
132  float fy = dy * f;
133  float fz = dz * f;
134  iforce.x += fx;
135  iforce.y += fy;
136  iforce.z += fz;
137  jforce.x -= fx;
138  jforce.y -= fy;
139  jforce.z -= fz;
140 
141  if (doSlow) {
142  float fxSlow = dx * fSlow;
143  float fySlow = dy * fSlow;
144  float fzSlow = dz * fSlow;
145  iforceSlow.x += fxSlow;
146  iforceSlow.y += fySlow;
147  iforceSlow.z += fzSlow;
148  jforceSlow.x -= fxSlow;
149  jforceSlow.y -= fySlow;
150  jforceSlow.z -= fzSlow;
151  }
152 }
153 
154 template<bool doSlow>
155 __device__ __forceinline__
156 void storeForces(const int pos, const float3 force, const float3 forceSlow,
157  float* __restrict__ devForces_x,
158  float* __restrict__ devForces_y,
159  float* __restrict__ devForces_z,
160  float* __restrict__ devForcesSlow_x,
161  float* __restrict__ devForcesSlow_y,
162  float* __restrict__ devForcesSlow_z)
163 {
164 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
165  if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
166  atomicAddNoRet(&devForces_x[pos], force.x);
167  atomicAddNoRet(&devForces_y[pos], force.y);
168  atomicAddNoRet(&devForces_z[pos], force.z);
169  }
170  if (doSlow) {
171  if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
172  atomicAddNoRet(&devForcesSlow_x[pos], forceSlow.x);
173  atomicAddNoRet(&devForcesSlow_y[pos], forceSlow.y);
174  atomicAddNoRet(&devForcesSlow_z[pos], forceSlow.z);
175  }
176  }
177 #else
178  atomicAdd(&devForces_x[pos], force.x);
179  atomicAdd(&devForces_y[pos], force.y);
180  atomicAdd(&devForces_z[pos], force.z);
181  if (doSlow) {
182  atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
183  atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
184  atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
185  }
186 #endif
187 }
188 
189 
190 template<bool doPairlist>
191 __device__ __forceinline__
192 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex) {
193  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
194  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
195  if (doPairlist) {
196  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
197  jexclIndex = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
198  jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
199  }
200 }
201 
202 template<bool doPairlist>
203 __device__ __forceinline__
204 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {
205  xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
206  vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
207  if (doPairlist) {
208  jatomIndex = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
209  }
210 }
211 
212 template<bool doSlow>
213 __device__ __forceinline__
214 void shuffleNext(float3& jforce, float3& jforceSlow) {
215  jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
216  jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
217  jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
218  if (doSlow) {
219  jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
220  jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
221  jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
222  }
223 }
224 
225 //#define USE_NEW_EXCL_METHOD
226 
227 //
228 // Returns a lower bound on the squared distance between a bounding box and an atom position
229 //
230 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
231  float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
232  float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
233  float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
234  float r2 = dx*dx + dy*dy + dz*dz;
235  return r2;
236 }
237 
238 #define LARGE_FLOAT (float)(1.0e10)
239 
240 //
241 // Nonbonded force kernel
242 //
243 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
244 __global__ void
245 #ifndef NAMD_CUDA
246 // TODO-HIP: The second parameter has different meaning on HCC.
247 // Using 10 as above reduces performance dramatically.
249 #else
250 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
251  doPairlist ? (10) : (doEnergy ? (10) : (10) ))
252 #endif
253 nonbondedForceKernel(
254  const int start, const int numTileLists,
255  const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
256  const int* __restrict__ tileJatomStart,
257  const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex,
258  const int* __restrict__ vdwTypes,
259  const float3 lata, const float3 latb, const float3 latc,
260  const float4* __restrict__ xyzq, const float cutoff2,
261 #ifdef USE_TABLE_ARRAYS
262  const float4* __restrict__ forceTable, const float4* __restrict__ energyTable,
263 #else
264  cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
265 #endif
266  // ----------
267  // doPairlist
268  float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
269  const int* __restrict__ atomIndex,
270  const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
271  unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
272  int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
273  const BoundingBox* __restrict__ boundingBoxes,
274 #ifdef USE_NEW_EXCL_METHOD
275  const int* __restrict__ minmaxExclAtom,
276 #endif
277  // ----------
278  float * __restrict__ devForce_x,
279  float * __restrict__ devForce_y,
280  float * __restrict__ devForce_z,
281  float * __restrict__ devForce_w,
282  float * __restrict__ devForceSlow_x,
283  float * __restrict__ devForceSlow_y,
284  float * __restrict__ devForceSlow_z,
285  float * __restrict__ devForceSlow_w,
286  // ---- USE_STREAMING_FORCES ----
287  const int numPatches,
288  unsigned int* __restrict__ patchNumCount,
289  const CudaPatchRecord* __restrict__ cudaPatches,
290  float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
291  int* __restrict__ mapPatchReadyQueue,
292  int* __restrict__ outputOrder,
293  // ------------------------------
294  TileListVirialEnergy* __restrict__ virialEnergy) {
295 
296  // Single warp takes care of one list of tiles
297  // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
298  // int itileList = start + threadIdx.x/WARPSIZE + blockDim.x/WARPSIZE*blockIdx.x;
299 // The commented-out line above is the CUDA-only indexing; the shared CUDA/HIP version below selects the tile list index based on NONBONDKERNEL_NUM_WARP
300  int itileList = start + (NONBONDKERNEL_NUM_WARP == 1 ? blockIdx.x : (threadIdx.x/WARPSIZE + NONBONDKERNEL_NUM_WARP*blockIdx.x)); if (itileList < numTileLists)
301  {
302 
303  float3 iforce;
304  float3 iforceSlow;
305  float energyVdw, energyElec, energySlow;
306  int nexcluded;
307  unsigned int itileListLen;
308  int2 patchInd;
309  int2 patchNumList;
310  __shared__ float4 s_xyzq[NONBONDKERNEL_NUM_WARP][WARPSIZE];
311  __shared__ int s_vdwtypej[NONBONDKERNEL_NUM_WARP][WARPSIZE];
312  __shared__ float3 s_jforce[NONBONDKERNEL_NUM_WARP][WARPSIZE];
313  __shared__ float3 s_jforceSlow[NONBONDKERNEL_NUM_WARP][WARPSIZE];
314  __shared__ int s_jatomIndex[NONBONDKERNEL_NUM_WARP][WARPSIZE];
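  // Per-warp shared-memory staging: each warp loads one tile of j-atoms (positions,
  // VdW types, atom indices) into these arrays and accumulates the j-forces here
  // before flushing them to global memory with storeForces().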
315 
316  // Warp index (0...warpsize-1)
317  const int wid = threadIdx.x % WARPSIZE;
318  const int iwarp = threadIdx.x / WARPSIZE;
319 
320  // Start computation
321  {
322 
323 
324  TileList tmp = tileLists[itileList];
325  int iatomStart = tmp.iatomStart;
326  int jtileStart = tmp.jtileStart;
327  int jtileEnd = tmp.jtileEnd;
328  patchInd = tmp.patchInd;
329  patchNumList = tmp.patchNumList;
330 
331  float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
332  float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
333  float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
334 
335  // DH - set zeroShift flag if magnitude of shift vector is zero
336  bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
337 
338  int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
339  if (doPairlist) {
340  PatchPairRecord PPStmp = patchPairs[itileList];
341  iatomSize = PPStmp.iatomSize;
342  iatomFreeSize = PPStmp.iatomFreeSize;
343  jatomSize = PPStmp.jatomSize;
344  jatomFreeSize = PPStmp.jatomFreeSize;
345  }
346 
347  // Write to global memory here to avoid register spilling
348  if (doVirial) {
349  if (wid == 0) {
350  virialEnergy[itileList].shx = shx;
351  virialEnergy[itileList].shy = shy;
352  virialEnergy[itileList].shz = shz;
353  }
354  }
355 
356  // Load i-atom data (and shift coordinates)
357  float4 xyzq_i = xyzq[iatomStart + wid];
358  xyzq_i.x += shx;
359  xyzq_i.y += shy;
360  xyzq_i.z += shz;
361  int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
362 
363  // Load i-atom data (and shift coordinates)
364  BoundingBox boundingBoxI;
365  if (doPairlist) {
366  boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
367  boundingBoxI.x += shx;
368  boundingBoxI.y += shy;
369  boundingBoxI.z += shz;
370  }
371 
372  // Get i-atom global index
373 #ifdef USE_NEW_EXCL_METHOD
374  int iatomIndex, minExclAtom, maxExclAtom;
375 #else
376  int iatomIndex;
377 #endif
378  if (doPairlist) {
379 #ifdef USE_NEW_EXCL_METHOD
380  iatomIndex = atomIndex[iatomStart + wid];
381  int2 tmp = minmaxExclAtom[iatomStart + wid];
382  minExclAtom = tmp.x;
383  maxExclAtom = tmp.y;
384 #else
385  iatomIndex = atomIndex[iatomStart + wid];
386 #endif
387  }
388 
389  // i-forces in registers
390  // float3 iforce;
391  iforce.x = 0.0f;
392  iforce.y = 0.0f;
393  iforce.z = 0.0f;
394 
395  // float3 iforceSlow;
396  if (doSlow) {
397  iforceSlow.x = 0.0f;
398  iforceSlow.y = 0.0f;
399  iforceSlow.z = 0.0f;
400  }
401 
402  // float energyVdw, energyElec, energySlow;
403  if (doEnergy) {
404  energyVdw = 0.0f;
405  energyElec = 0.0f;
406  if (doSlow) energySlow = 0.0f;
407  }
408 
409  // Number of exclusions
410  // NOTE: Lowest bit is used as indicator bit for tile pairs:
411  // bit 0 tile has no atoms within pairlist cutoff
412  // bit 1 tile has atoms within pairlist cutoff
413  // int nexcluded;
414  if (doPairlist) nexcluded = 0;
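  // With this encoding nexcluded holds 2*(number of excluded pairs) + indicator bit;
  // the exclusion count is recovered below with nexcluded >>= 1 before the warp reduction.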
415 
416  // Number of i loops and free atoms
417  int nfreei;
418  if (doPairlist) {
419  int nloopi = min(iatomSize - iatomStart, WARPSIZE);
420  nfreei = max(iatomFreeSize - iatomStart, 0);
421  if (wid >= nloopi) {
422  xyzq_i.x = -LARGE_FLOAT;
423  xyzq_i.y = -LARGE_FLOAT;
424  xyzq_i.z = -LARGE_FLOAT;
425  }
426  }
427 
428  // tile list stuff
429  // int itileListLen;
430  // int minJatomStart;
431  if (doPairlist) {
432  // minJatomStart = tileJatomStart[jtileStart];
433  itileListLen = 0;
434  }
435 
436  // Exclusion index and maxdiff
437  int iexclIndex, iexclMaxdiff;
438  if (doPairlist) {
439  int2 tmp = exclIndexMaxDiff[iatomStart + wid];
440  iexclIndex = tmp.x;
441  iexclMaxdiff = tmp.y;
442  }
443 
444  for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
445 
446  // Load j-atom starting index and exclusion mask
447  int jatomStart = tileJatomStart[jtile];
448 
449  float4 xyzq_j = xyzq[jatomStart + wid];
451 
452  // Check for early bail
453  if (doPairlist) {
454  float r2bb = distsq(boundingBoxI, xyzq_j);
455  if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
456  }
457  WarpMask excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
458  int vdwtypej = vdwTypes[jatomStart + wid];
459  s_vdwtypej[iwarp][wid] = vdwtypej;
460 
461  // Get i-atom global index
462  if (doPairlist) {
463  s_jatomIndex[iwarp][wid] = atomIndex[jatomStart + wid];
464  }
465 
466  // Number of j loops and free atoms
467  int nfreej;
468  if (doPairlist) {
469  int nloopj = min(jatomSize - jatomStart, WARPSIZE);
470  nfreej = max(jatomFreeSize - jatomStart, 0);
471  //if (nfreei == 0 && nfreej == 0) continue;
472  if (wid >= nloopj) {
473  xyzq_j.x = LARGE_FLOAT;
474  xyzq_j.y = LARGE_FLOAT;
475  xyzq_j.z = LARGE_FLOAT;
476  }
477  }
478 
479  s_xyzq[iwarp][wid] = xyzq_j;
480 
481  // DH - self requires that zeroShift is also set
482  const bool self = zeroShift && (iatomStart == jatomStart);
483  const int modval = (self) ? 2*WARPSIZE-1 : WARPSIZE-1;
484 
485  s_jforce[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
486  if (doSlow)
487  s_jforceSlow[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
489 
490 
491  int t = (self) ? 1 : 0;
492 
493  if (doPairlist) {
494  // Build pair list
495  // NOTE: Pairlist update, we must also include the diagonal since this is used
496  // in GBIS phase 2.
497  // Clear the lowest (indicator) bit
498  nexcluded &= (~1);
499 
500  // For self tiles, do the diagonal term (t=0).
501  // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
502  if (self) {
503  int j = (0 + wid) & modval;
504  xyzq_j = s_xyzq[iwarp][j];
505  float dx = xyzq_j.x - xyzq_i.x;
506  float dy = xyzq_j.y - xyzq_i.y;
507  float dz = xyzq_j.z - xyzq_i.z;
508 
509  float r2 = dx*dx + dy*dy + dz*dz;
510 
511  if (j < WARPSIZE && r2 < plcutoff2) {
512  // We have atom pair within the pairlist cutoff => Set indicator bit
513  nexcluded |= 1;
514  }
515  }
516 
517  for (;t < WARPSIZE;t++) {
518  int j = (t + wid) & modval;
519 
520  excl >>= 1;
521  if (j < WARPSIZE ) {
522  xyzq_j = s_xyzq[iwarp][j];
523  float dx = xyzq_j.x - xyzq_i.x;
524  float dy = xyzq_j.y - xyzq_i.y;
525  float dz = xyzq_j.z - xyzq_i.z;
526  float r2 = dx*dx + dy*dy + dz*dz;
527  // We have atom pair within the pairlist cutoff => Set indicator bit
528  if(r2 < plcutoff2){
529  nexcluded |= 1;
530  if (j < nfreej || wid < nfreei) {
531  bool excluded = false;
532  int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
533  if ( abs(indexdiff) <= iexclMaxdiff) {
534  indexdiff += iexclIndex;
535  int indexword = ((unsigned int) indexdiff) >> 5;
536 #ifdef NAMD_CUDA
537  if ( indexword < MAX_CONST_EXCLUSIONS ) {
538  indexword = constExclusions[indexword];
539  } else
540 #endif
541  {
542  indexword = overflowExclusions[indexword];
543  }
544 
545  excluded = ((indexword & (1<<(indexdiff&31))) != 0); // WARPSIZE-1?
546  }
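  // The test above reads bit (indexdiff & 31) of 32-bit word (indexdiff >> 5) in the
  // per-atom exclusion bitmap; words inside the constant-memory window come from
  // constExclusions, the rest from overflowExclusions in global memory.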
547  if (excluded) nexcluded += 2;
548  if (!excluded) excl |= (WarpMask)1 << (WARPSIZE-1);
549  if (!excluded && r2 < cutoff2) {
550  calcForceEnergy<doEnergy, doSlow>(
551  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
552  vdwtypei, s_vdwtypej[iwarp][j],
553  vdwCoefTable,
554 #ifdef USE_TABLE_ARRAYS
555  forceTable, energyTable,
556 #else
557  vdwCoefTableTex, forceTableTex, energyTableTex,
558 #endif
559  iforce, iforceSlow,
560  s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
561  energyVdw,
562  energyElec, energySlow);
563  }
564  }
565  }
566  }
567  WARP_SYNC(WARP_FULL_MASK);
568  } // t
569  } else {
570  // Just compute forces
571  if (self) {
572  // Clear the first bit
573  excl = excl & (~(WarpMask)1);
574  }
575  for (int t = 0;t < WARPSIZE;t++) {
576  if ((excl & 1)) {
577  int j = (wid+t) & (WARPSIZE-1);
578  xyzq_j = s_xyzq[iwarp][j];
579  float dx = xyzq_j.x - xyzq_i.x;
580  float dy = xyzq_j.y - xyzq_i.y;
581  float dz = xyzq_j.z - xyzq_i.z;
582 
583  float r2 = dx*dx + dy*dy + dz*dz;
584 
585  if (r2 < cutoff2) {
586  calcForceEnergy<doEnergy, doSlow>(
587  r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
588  vdwtypei, s_vdwtypej[iwarp][j],
589  vdwCoefTable,
590 #ifdef USE_TABLE_ARRAYS
591  forceTable, energyTable,
592 #else
593  vdwCoefTableTex, forceTableTex, energyTableTex,
594 #endif
595  iforce, iforceSlow,
596  s_jforce[iwarp][j],
597  s_jforceSlow[iwarp][j],
598  energyVdw, energyElec, energySlow);
599  } // (r2 < cutoff2)
600  } // (excl & 1)
601  excl >>= 1;
602  WARP_SYNC(WARP_FULL_MASK);
603  } // t
604  }
605 
606  // Write j-forces
607  storeForces<doSlow>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
608  devForce_x, devForce_y, devForce_z,
609  devForceSlow_x, devForceSlow_y, devForceSlow_z);
610 
611  // Write exclusions
612  if (doPairlist && WARP_ANY(WARP_FULL_MASK, nexcluded & 1)) {
613  int anyexcl = (65536 | WARP_ANY(WARP_FULL_MASK, excl != 0));
614  // Mark this jtile as non-empty:
615  // VdW: 1 if tile has atom pairs within pairlist cutoff and some these atoms interact
616  // GBIS: 65536 if tile has atom pairs within pairlist cutoff but not necessary interacting (i.e. these atoms are fixed or excluded)
617  if (wid == 0) jtiles[jtile] = anyexcl;
618  // Store exclusions
619  tileExcls[jtile].excl[wid] = excl;
620  // itileListLen:
621  // lower 16 bits number of tiles with atom pairs within pairlist cutoff that interact
622  // upper 16 bits number of tiles with atom pairs within pairlist cutoff (but not necessary interacting)
623  itileListLen += anyexcl;
624  // NOTE, this minJatomStart is only stored once for the first tile list entry
625  // minJatomStart = min(minJatomStart, jatomStart);
626  }
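  // Once the jtile loop finishes, (itileListLen & 65535) is the number of tiles whose
  // atom pairs interact and (itileListLen >> 16) the number of tiles within the pairlist
  // cutoff (the GBIS count); both are consumed when the pairlist statistics are stored below.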
627 
628  } // jtile
629 
630  // Write i-forces
631  storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
632  devForce_x, devForce_y, devForce_z,
633  devForceSlow_x, devForceSlow_y, devForceSlow_z);
634  }
635  // Done with computation
636 
637  // Save pairlist stuff
638  if (doPairlist) {
639 
640  // Warp index (0...warpsize-1)
641  // const int wid = threadIdx.x % WARPSIZE;
642 
643  if (wid == 0) {
644  // minJatomStart is in range [0 ... atomStorageSize-1]
645  //int atom0 = (minJatomStart)/WARPSIZE;
646  // int atom0 = 0;
647  // int storageOffset = atomStorageSize/WARPSIZE;
648  // int itileListLen = 0;
649  // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
650  // Store 0 if itileListLen == 0
651  // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
652  tileListDepth[itileList] = itileListLen;
653  tileListOrder[itileList] = itileList;
654  // Number of active tilelists with tile with atom pairs within pairlist cutoff that interact
655  if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
656  // Number of active tilelists with tiles with atom pairs within pairlist cutoff (but not necessary interacting)
657  if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
658  // NOTE: always numTileListsGBIS >= numTileLists
659  }
660 
661  typedef cub::WarpReduce<int> WarpReduceInt;
662  __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
663  int warpId = threadIdx.x / WARPSIZE;
664  // Remove indicator bit
665  nexcluded >>= 1;
666  volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
667  if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
668 
669  }
670 
671  if (doVirial) {
672  // Warp index (0...warpsize-1)
673  // const int wid = threadIdx.x % WARPSIZE;
674 
675  typedef cub::WarpReduce<float> WarpReduce;
676  __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
677  int warpId = threadIdx.x / WARPSIZE;
678  volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
679  BLOCK_SYNC;
680  volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
681  BLOCK_SYNC;
682  volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
683  BLOCK_SYNC;
684  if (wid == 0) {
685  virialEnergy[itileList].forcex = iforcexSum;
686  virialEnergy[itileList].forcey = iforceySum;
687  virialEnergy[itileList].forcez = iforcezSum;
688  }
689 
690  if (doSlow) {
691  iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
692  BLOCK_SYNC;
693  iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
694  BLOCK_SYNC;
695  iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
696  BLOCK_SYNC;
697  if (wid == 0) {
698  virialEnergy[itileList].forceSlowx = iforcexSum;
699  virialEnergy[itileList].forceSlowy = iforceySum;
700  virialEnergy[itileList].forceSlowz = iforcezSum;
701  }
702  }
703  }
704 
705  // Reduce energy
706  if (doEnergy) {
707  // NOTE: We must hand write these warp-wide reductions to avoid excess register spillage
708  // (Why does CUB suck here?)
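  // Butterfly (XOR-shuffle) reduction: after log2(WARPSIZE) steps every lane holds the
  // warp-wide sum, and lane 0 writes the result out below.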
709 #pragma unroll
710  for (int i=WARPSIZE/2;i >= 1;i/=2) {
711  energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, WARPSIZE);
712  energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, WARPSIZE);
713  if (doSlow) energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, WARPSIZE);
714  }
715 
716  if (threadIdx.x % WARPSIZE == 0) {
717  virialEnergy[itileList].energyVdw = energyVdw;
718  virialEnergy[itileList].energyElec = energyElec;
719  if (doSlow) virialEnergy[itileList].energySlow = energySlow;
720  }
721  }
722 
723  if (doStreaming) {
724  // Make sure devForces and devForcesSlow have been written into device memory
726  __threadfence();
727 
728  int patchDone[2] = {false, false};
729  const int wid = threadIdx.x % WARPSIZE;
730  if (wid == 0) {
731  int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
732  patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
733  if (patchInd.x != patchInd.y) {
734  int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
735  patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
736  }
737  }
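  // atomicInc wraps back to zero when the last tile list of a patch is counted
  // (old value == patchNumList-1), so patchNumCount is automatically reset for the
  // next force evaluation.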
738 
739  patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
740  patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
741 
742  if (patchDone[0]) {
743  // Patch 1 is done, write onto host-mapped memory
744  CudaPatchRecord patch = cudaPatches[patchInd.x];
745  int start = patch.atomStart;
746  int end = start + patch.numAtoms;
747  for (int i=start+wid;i < end;i+=WARPSIZE) {
748  mapForces[i] = make_float4(devForce_x[i],
749  devForce_y[i], devForce_z[i], devForce_w[i]);
750  if (doSlow){
751  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
752  devForceSlow_y[i],
753  devForceSlow_z[i],
754  devForceSlow_w[i]);
755  }
756  }
757  }
758  if (patchDone[1]) {
759  // Patch 2 is done
760  CudaPatchRecord patch = cudaPatches[patchInd.y];
761  int start = patch.atomStart;
762  int end = start + patch.numAtoms;
763  for (int i=start+wid;i < end;i+=WARPSIZE) {
764  mapForces[i] = make_float4(devForce_x[i], devForce_y[i], devForce_z[i], devForce_w[i]);
765  if (doSlow){
766  mapForcesSlow[i] = make_float4(devForceSlow_x[i],
767  devForceSlow_y[i],
768  devForceSlow_z[i],
769  devForceSlow_w[i]);
770  }
771  }
772  }
773 
774  if (patchDone[0] || patchDone[1]) {
775  // Make sure mapForces and mapForcesSlow are up-to-date
777  __threadfence_system();
778  // Add patch into "patchReadyQueue"
779  if (wid == 0) {
780  if (patchDone[0]) {
781  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
782  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
783  mapPatchReadyQueue[ind] = patchInd.x;
784  }
785  if (patchDone[1]) {
786  int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
787  // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
788  mapPatchReadyQueue[ind] = patchInd.y;
789  }
790  }
791  }
792  }
793 
794  if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
795  int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
796  outputOrder[index] = itileList;
797  }
798  } // if (itileList < numTileLists)
799 }
800 
801 //
802 // Finish up - reduce virials from nonbonded kernel
803 //
804 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
805  const int atomStorageSize,
806  const float4* __restrict__ xyzq,
807  const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
808  VirialEnergy* __restrict__ virialEnergy) {
809 
810  for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
811  {
812  int i = ibase + threadIdx.x;
813 
814  // Set to zero to avoid nan*0
815  float4 pos;
816  pos.x = 0.0f;
817  pos.y = 0.0f;
818  pos.z = 0.0f;
819  float4 force, forceSlow;
820  force.x = 0.0f;
821  force.y = 0.0f;
822  force.z = 0.0f;
823  forceSlow.x = 0.0f;
824  forceSlow.y = 0.0f;
825  forceSlow.z = 0.0f;
826  if (i < atomStorageSize) {
827  pos = xyzq[i];
828  force = devForces[i];
829  if (doSlow) forceSlow = devForcesSlow[i];
830  }
831  // Reduce across the entire thread block
832  float vxxt = force.x*pos.x;
833  float vxyt = force.x*pos.y;
834  float vxzt = force.x*pos.z;
835  float vyxt = force.y*pos.x;
836  float vyyt = force.y*pos.y;
837  float vyzt = force.y*pos.z;
838  float vzxt = force.z*pos.x;
839  float vzyt = force.z*pos.y;
840  float vzzt = force.z*pos.z;
841 
842  const int bin = blockIdx.x % ATOMIC_BINS;
843 
844  // atomicAdd(&virialEnergy->virial[0], (double)vxx);
845  // atomicAdd(&virialEnergy->virial[1], (double)vxy);
846  // atomicAdd(&virialEnergy->virial[2], (double)vxz);
847  // atomicAdd(&virialEnergy->virial[3], (double)vyx);
848  // atomicAdd(&virialEnergy->virial[4], (double)vyy);
849  // atomicAdd(&virialEnergy->virial[5], (double)vyz);
850  // atomicAdd(&virialEnergy->virial[6], (double)vzx);
851  // atomicAdd(&virialEnergy->virial[7], (double)vzy);
852  // atomicAdd(&virialEnergy->virial[8], (double)vzz);
853 
854  typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
855  __shared__ typename BlockReduce::TempStorage tempStorage;
856  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
857  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
858  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
859  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
860  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
861  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
862  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
863  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
864  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
865  if (threadIdx.x == 0) {
866  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
867  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
868  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
869  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
870  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
871  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
872  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
873  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
874  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
875  }
876 
877  if (doSlow) {
878  // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
879  float vxxSlowt = forceSlow.x*pos.x;
880  float vxySlowt = forceSlow.x*pos.y;
881  float vxzSlowt = forceSlow.x*pos.z;
882  float vyxSlowt = forceSlow.y*pos.x;
883  float vyySlowt = forceSlow.y*pos.y;
884  float vyzSlowt = forceSlow.y*pos.z;
885  float vzxSlowt = forceSlow.z*pos.x;
886  float vzySlowt = forceSlow.z*pos.y;
887  float vzzSlowt = forceSlow.z*pos.z;
888  // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
889  // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
890  // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
891  // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
892  // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
893  // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
894  // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
895  // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
896  // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
897  float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
898  float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
899  float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
900  float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
901  float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
902  float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
903  float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
904  float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
905  float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
906  if (threadIdx.x == 0) {
907  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxxSlow);
908  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxySlow);
909  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxzSlow);
910  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyxSlow);
911  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyySlow);
912  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyzSlow);
913  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzxSlow);
914  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzySlow);
915  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzzSlow);
916  }
917  }
918 
919  }
920 
921 }
922 
923 __global__ void reduceVirialEnergyKernel(
924  const bool doEnergy, const bool doVirial, const bool doSlow,
925  const int numTileLists,
926  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
927  VirialEnergy* __restrict__ virialEnergy) {
928 
929  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
930  {
931  int itileList = ibase + threadIdx.x;
932  TileListVirialEnergy ve;
933  if (itileList < numTileLists) {
934  ve = tileListVirialEnergy[itileList];
935  } else {
936  // Set to zero to avoid nan*0
937  if (doVirial) {
938  ve.shx = 0.0f;
939  ve.shy = 0.0f;
940  ve.shz = 0.0f;
941  ve.forcex = 0.0f;
942  ve.forcey = 0.0f;
943  ve.forcez = 0.0f;
944  ve.forceSlowx = 0.0f;
945  ve.forceSlowy = 0.0f;
946  ve.forceSlowz = 0.0f;
947  }
948  if (doEnergy) {
949  ve.energyVdw = 0.0;
950  ve.energyElec = 0.0;
951  ve.energySlow = 0.0;
952  // ve.energyGBIS = 0.0;
953  }
954  }
955 
956  const int bin = blockIdx.x % ATOMIC_BINS;
957 
958  if (doVirial) {
959  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
960  __shared__ typename BlockReduce::TempStorage tempStorage;
961  float vxxt = ve.forcex*ve.shx;
962  float vxyt = ve.forcex*ve.shy;
963  float vxzt = ve.forcex*ve.shz;
964  float vyxt = ve.forcey*ve.shx;
965  float vyyt = ve.forcey*ve.shy;
966  float vyzt = ve.forcey*ve.shz;
967  float vzxt = ve.forcez*ve.shx;
968  float vzyt = ve.forcez*ve.shy;
969  float vzzt = ve.forcez*ve.shz;
970  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
971  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
972  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
973  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
974  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
975  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
976  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
977  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
978  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
979  if (threadIdx.x == 0) {
980  atomicAdd(&virialEnergy[bin].virial[0], (double)vxx);
981  atomicAdd(&virialEnergy[bin].virial[1], (double)vxy);
982  atomicAdd(&virialEnergy[bin].virial[2], (double)vxz);
983  atomicAdd(&virialEnergy[bin].virial[3], (double)vyx);
984  atomicAdd(&virialEnergy[bin].virial[4], (double)vyy);
985  atomicAdd(&virialEnergy[bin].virial[5], (double)vyz);
986  atomicAdd(&virialEnergy[bin].virial[6], (double)vzx);
987  atomicAdd(&virialEnergy[bin].virial[7], (double)vzy);
988  atomicAdd(&virialEnergy[bin].virial[8], (double)vzz);
989  }
990 
991  if (doSlow) {
992  typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
993  __shared__ typename BlockReduce::TempStorage tempStorage;
994  float vxxt = ve.forceSlowx*ve.shx;
995  float vxyt = ve.forceSlowx*ve.shy;
996  float vxzt = ve.forceSlowx*ve.shz;
997  float vyxt = ve.forceSlowy*ve.shx;
998  float vyyt = ve.forceSlowy*ve.shy;
999  float vyzt = ve.forceSlowy*ve.shz;
1000  float vzxt = ve.forceSlowz*ve.shx;
1001  float vzyt = ve.forceSlowz*ve.shy;
1002  float vzzt = ve.forceSlowz*ve.shz;
1003  float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
1004  float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
1005  float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
1006  float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
1007  float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
1008  float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
1009  float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
1010  float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
1011  float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
1012  if (threadIdx.x == 0) {
1013  atomicAdd(&virialEnergy[bin].virialSlow[0], (double)vxx);
1014  atomicAdd(&virialEnergy[bin].virialSlow[1], (double)vxy);
1015  atomicAdd(&virialEnergy[bin].virialSlow[2], (double)vxz);
1016  atomicAdd(&virialEnergy[bin].virialSlow[3], (double)vyx);
1017  atomicAdd(&virialEnergy[bin].virialSlow[4], (double)vyy);
1018  atomicAdd(&virialEnergy[bin].virialSlow[5], (double)vyz);
1019  atomicAdd(&virialEnergy[bin].virialSlow[6], (double)vzx);
1020  atomicAdd(&virialEnergy[bin].virialSlow[7], (double)vzy);
1021  atomicAdd(&virialEnergy[bin].virialSlow[8], (double)vzz);
1022  }
1023  }
1024  }
1025 
1026  if (doEnergy) {
1027  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1028  __shared__ typename BlockReduce::TempStorage tempStorage;
1029  double energyVdw = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
1030  double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
1031  if (threadIdx.x == 0) {
1032  atomicAdd(&virialEnergy[bin].energyVdw, energyVdw);
1033  atomicAdd(&virialEnergy[bin].energyElec, energyElec);
1034  }
1035  if (doSlow) {
1036  double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
1037  if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energySlow, energySlow);
1038  }
1039  // if (doGBIS) {
1040  // double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
1041  // if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
1042  // }
1043  }
1044 
1045  }
1046 
1047 }
1048 
1049 __global__ void reduceGBISEnergyKernel(const int numTileLists,
1050  const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
1051  VirialEnergy* __restrict__ virialEnergy) {
1052 
1053  for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1054  {
1055  int itileList = ibase + threadIdx.x;
1056  double energyGBISt = 0.0;
1057  if (itileList < numTileLists) {
1058  energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
1059  }
1060 
1061  const int bin = blockIdx.x % ATOMIC_BINS;
1062 
1063  typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1064  __shared__ typename BlockReduce::TempStorage tempStorage;
1065  double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
1066  if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energyGBIS, energyGBIS);
1067  }
1068 
1069 }
1070 
1071 __global__ void reduceNonbondedBinsKernel(
1072  const bool doVirial,
1073  const bool doEnergy,
1074  const bool doSlow,
1075  const bool doGBIS,
1076  VirialEnergy* __restrict__ virialEnergy) {
1077 
1078  const int bin = threadIdx.x;
1079 
1080  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
1081  __shared__ typename WarpReduce::TempStorage tempStorage;
1082 
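  // This kernel is launched with a single block of ATOMIC_BINS threads; one logical-warp
  // reduction sums the per-bin partial results and thread 0 writes the totals into
  // virialEnergy[0].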
1083  if (doVirial) {
1084  double vxx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[0]);
1085  double vxy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[1]);
1086  double vxz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[2]);
1087  double vyx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[3]);
1088  double vyy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[4]);
1089  double vyz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[5]);
1090  double vzx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[6]);
1091  double vzy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[7]);
1092  double vzz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[8]);
1093  if (threadIdx.x == 0) {
1094  virialEnergy->virial[0] = vxx;
1095  virialEnergy->virial[1] = vxy;
1096  virialEnergy->virial[2] = vxz;
1097  virialEnergy->virial[3] = vyx;
1098  virialEnergy->virial[4] = vyy;
1099  virialEnergy->virial[5] = vyz;
1100  virialEnergy->virial[6] = vzx;
1101  virialEnergy->virial[7] = vzy;
1102  virialEnergy->virial[8] = vzz;
1103  }
1104 
1105  if (doSlow) {
1106  double vxxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[0]);
1107  double vxySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[1]);
1108  double vxzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[2]);
1109  double vyxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[3]);
1110  double vyySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[4]);
1111  double vyzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[5]);
1112  double vzxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[6]);
1113  double vzySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[7]);
1114  double vzzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[8]);
1115  if (threadIdx.x == 0) {
1116  virialEnergy->virialSlow[0] = vxxSlow;
1117  virialEnergy->virialSlow[1] = vxySlow;
1118  virialEnergy->virialSlow[2] = vxzSlow;
1119  virialEnergy->virialSlow[3] = vyxSlow;
1120  virialEnergy->virialSlow[4] = vyySlow;
1121  virialEnergy->virialSlow[5] = vyzSlow;
1122  virialEnergy->virialSlow[6] = vzxSlow;
1123  virialEnergy->virialSlow[7] = vzySlow;
1124  virialEnergy->virialSlow[8] = vzzSlow;
1125  }
1126  }
1127  }
1128 
1129  if (doEnergy) {
1130  double energyVdw = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw);
1131  double energyElec = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec);
1132  if (threadIdx.x == 0) {
1133  virialEnergy->energyVdw = energyVdw;
1134  virialEnergy->energyElec = energyElec;
1135  }
1136  if (doSlow) {
1137  double energySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow);
1138  if (threadIdx.x == 0) {
1139  virialEnergy->energySlow = energySlow;
1140  }
1141  }
1142  if (doGBIS) {
1143  double energyGBIS = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyGBIS);
1144  if (threadIdx.x == 0) {
1145  virialEnergy->energyGBIS = energyGBIS;
1146  }
1147  }
1148  }
1149 }
1150 
1151 // ##############################################################################################
1152 // ##############################################################################################
1153 // ##############################################################################################
1154 
1155 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
1156  bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
1157 
1158  cudaCheck(cudaSetDevice(deviceID));
1159 
1160  overflowExclusions = NULL;
1161  overflowExclusionsSize = 0;
1162 
1163  exclIndexMaxDiff = NULL;
1164  exclIndexMaxDiffSize = 0;
1165 
1166  atomIndex = NULL;
1167  atomIndexSize = 0;
1168 
1169  vdwTypes = NULL;
1170  vdwTypesSize = 0;
1171 
1172  patchNumCount = NULL;
1173  patchNumCountSize = 0;
1174 
1175  patchReadyQueue = NULL;
1176  patchReadyQueueSize = 0;
1177 
1178  force_x = force_y = force_z = force_w = NULL;
1179  forceSize = 0;
1180  forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
1181  forceSlowSize = 0;
1182 }
1183 
1184 void CudaComputeNonbondedKernel::reallocate_forceSOA(int atomStorageSize)
1185 {
1186  reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
1187  reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
1188  reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
1189  reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
1190  reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
1191  reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
1192  reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
1193  reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
1194 }
1195 
1196 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
1197  cudaCheck(cudaSetDevice(deviceID));
1198  if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
1199  if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
1200  if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
1201  if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
1202  if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
1203  if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
1204  if (force_x != NULL) deallocate_device<float>(&force_x);
1205  if (force_y != NULL) deallocate_device<float>(&force_y);
1206  if (force_z != NULL) deallocate_device<float>(&force_z);
1207  if (force_w != NULL) deallocate_device<float>(&force_w);
1208  if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
1209  if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
1210  if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
1211  if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
1212 }
1213 
1214 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
1215  const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
1216 
1217  reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
1218  reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
1219  reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
1220 
1221  copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
1222  copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
1223  copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
1224 }
1225 
1226 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
1227  if (!doStreaming) {
1228  NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
1229  }
1230  return patchReadyQueue;
1231 }
1232 
1233 template <int doSlow>
1234 __global__ void transposeForcesKernel(float4 *f, float4 *fSlow,
1235  float *fx, float *fy, float *fz, float *fw,
1236  float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww,
1237  int n)
1238 {
1239  int tid = blockIdx.x*blockDim.x + threadIdx.x;
1240  if (tid < n) {
1241  f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
1242  if (doSlow) {
1243  fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
1244  }
1245  }
1246 }
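// transposeForcesKernel gathers the SOA force buffers (force_x/y/z/w and the slow
// equivalents) back into the AOS float4 arrays consumed by the rest of the force pipeline.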
1247 
1248 
1249 
1250 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
1251  const int atomStorageSize, const bool doPairlist,
1252  const bool doEnergy, const bool doVirial, const bool doSlow,
1253  const float3 lata, const float3 latb, const float3 latc,
1254  const float4* h_xyzq, const float cutoff2,
1255  float4* d_forces, float4* d_forcesSlow,
1256  float4* h_forces, float4* h_forcesSlow,
1257  cudaStream_t stream) {
1258 
1259  if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
1260 
1261  // clear_device_array<float4>(d_forces, atomStorageSize, stream);
1262  // if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
1263 
1264 
1265  // XXX TODO: Clear all of these
1266  if(1){
1267  // two clears
1268  tlKernel.clearTileListStat(stream);
1269  clear_device_array<float>(force_x, atomStorageSize, stream);
1270  clear_device_array<float>(force_y, atomStorageSize, stream);
1271  clear_device_array<float>(force_z, atomStorageSize, stream);
1272  clear_device_array<float>(force_w, atomStorageSize, stream);
1273  if (doSlow) {
1274  clear_device_array<float>(forceSlow_x, atomStorageSize, stream);
1275  clear_device_array<float>(forceSlow_y, atomStorageSize, stream);
1276  clear_device_array<float>(forceSlow_z, atomStorageSize, stream);
1277  clear_device_array<float>(forceSlow_w, atomStorageSize, stream);
1278  }
1279  }
1280 
1281  // --- streaming ----
1282  float4* m_forces = NULL;
1283  float4* m_forcesSlow = NULL;
1284  int* m_patchReadyQueue = NULL;
1285  int numPatches = 0;
1286  unsigned int* patchNumCountPtr = NULL;
1287  if (doStreaming) {
1288  numPatches = tlKernel.getNumPatches();
1289  if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
1290  // If re-allocated, clear array
1291  clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
1292  }
1293  patchNumCountPtr = patchNumCount;
1294  bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
1295  if (re) {
1296  // If re-allocated, re-set to "-1"
1297  for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
1298  }
1299  cudaCheck(cudaHostGetDevicePointer((void**)&m_patchReadyQueue, patchReadyQueue, 0));
1300  cudaCheck(cudaHostGetDevicePointer((void**)&m_forces, h_forces, 0));
1301  cudaCheck(cudaHostGetDevicePointer((void**)&m_forcesSlow, h_forcesSlow, 0));
1302  }
1303  // -----------------
1304 
1305  if (doVirial || doEnergy) {
1306  tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
1307  }
1308 
1309  int shMemSize = 0;
1310 
1311  int* outputOrderPtr = tlKernel.getOutputOrder();
1312 
1313  int nwarp = NONBONDKERNEL_NUM_WARP;
1314  int nthread = WARPSIZE*nwarp;
1315  int start = 0;
1316  while (start < tlKernel.getNumTileLists())
1317  {
1318 
1319  int nleft = tlKernel.getNumTileLists() - start;
1320  int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
1321 
1322 #ifdef USE_TABLE_ARRAYS
1323  #define TABLE_PARAMS \
1324  cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
1325 #else
1326  #define TABLE_PARAMS \
1327  cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
1328 #endif
1329 
1330 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
1331  nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
1332  <<< nblock, nthread, shMemSize, stream >>> (\
1333  start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
1334  cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), cudaNonbondedTables.getVdwCoefTableTex(), \
1335  vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
1336  TABLE_PARAMS, \
1337  tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
1338  tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
1339  tlKernel.getBoundingBoxes(), \
1340  force_x, force_y, force_z, force_w, \
1341  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
1342  numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
1343  outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
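// Each CALL(...) launches one fully specialized instance of nonbondedForceKernel; the
// run-time flags below select exactly one of the 32
// doEnergy/doVirial/doSlow/doPairlist/doStreaming combinations.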
1344 
1345  bool called = false;
1346 
1347  if (doStreaming) {
1348  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
1349  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
1350  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
1351  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
1352  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
1353  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
1354  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
1355  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
1356 
1357  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 1);
1358  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 1);
1359  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 1);
1360  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 1);
1361  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 1);
1362  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 1);
1363  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 1);
1364  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 1);
1365  } else {
1366  if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
1367  if (!doEnergy && !doVirial && doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
1368  if (!doEnergy && doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
1369  if (!doEnergy && doVirial && doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
1370  if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
1371  if ( doEnergy && !doVirial && doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
1372  if ( doEnergy && doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
1373  if ( doEnergy && doVirial && doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
1374 
1375  if (!doEnergy && !doVirial && !doSlow && doPairlist) CALL(0, 0, 0, 1, 0);
1376  if (!doEnergy && !doVirial && doSlow && doPairlist) CALL(0, 0, 1, 1, 0);
1377  if (!doEnergy && doVirial && !doSlow && doPairlist) CALL(0, 1, 0, 1, 0);
1378  if (!doEnergy && doVirial && doSlow && doPairlist) CALL(0, 1, 1, 1, 0);
1379  if ( doEnergy && !doVirial && !doSlow && doPairlist) CALL(1, 0, 0, 1, 0);
1380  if ( doEnergy && !doVirial && doSlow && doPairlist) CALL(1, 0, 1, 1, 0);
1381  if ( doEnergy && doVirial && !doSlow && doPairlist) CALL(1, 1, 0, 1, 0);
1382  if ( doEnergy && doVirial && doSlow && doPairlist) CALL(1, 1, 1, 1, 0);
1383  }
1384 
1385  if (!called) {
1386  NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
1387  }
1388 
1389  {
1390  int block = 128;
1391  int grid = (atomStorageSize + block - 1)/block;
1392  if (doSlow)
1393  transposeForcesKernel<1><<<grid, block, 0, stream>>>(
1394  d_forces, d_forcesSlow,
1395  force_x, force_y, force_z, force_w,
1396  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1397  atomStorageSize);
1398  else
1399  transposeForcesKernel<0><<<grid, block, 0, stream>>>(
1400  d_forces, d_forcesSlow,
1401  force_x, force_y, force_z, force_w,
1402  forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1403  atomStorageSize);
1404  }
1405 
1406 #undef CALL
1407 #undef TABLE_PARAMS
1408  cudaCheck(cudaGetLastError());
1409 
1410  start += nblock*nwarp;
1411  }
1412 
1413 }
1414 
1415 //
1416 // Perform virial and energy reductions for non-bonded force calculation
1417 //
1418 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
1419  const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
1420  float4* d_forces, float4* d_forcesSlow,
1421  VirialEnergy* d_virialEnergy, cudaStream_t stream) {
1422 
1423  if (doEnergy || doVirial) {
1424  clear_device_array<VirialEnergy>(d_virialEnergy, ATOMIC_BINS, stream);
1425  }
1426 
1427  if (doVirial)
1428  {
1429  int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
1430  int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
1431  reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>> (
1432  doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
1433  cudaCheck(cudaGetLastError());
1434  }
1435 
1436  if (doVirial || doEnergy)
1437  {
1438  int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
1439  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
1440  reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>> (
1441  doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1442  cudaCheck(cudaGetLastError());
1443  }
1444 
1445  if (doGBIS && doEnergy)
1446  {
1447  int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
1448  int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
1449  reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>> (
1450  tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
1451  cudaCheck(cudaGetLastError());
1452  }
1453  if (ATOMIC_BINS > 1)
1454  {
1455  // Reduce d_virialEnergy[ATOMIC_BINS] in-place (results are in d_virialEnergy[0])
1456  reduceNonbondedBinsKernel<<<1, ATOMIC_BINS, 0, stream>>>(doVirial, doEnergy, doSlow, doGBIS, d_virialEnergy);
1457  }
1458 
1459 }
1460 
1461 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
1462 #ifdef NAMD_CUDA
1463 // TODO-HIP: cudaMemcpyToSymbol crashes on HIP-hcc with single or multiple GPUs.
1464 // Disable it considering that using constant memory does not improve performance.
1465 // This explains some choices made above.
1466  int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
1467  cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
1468 #endif
1469  reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
1470  copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
1471 }
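
//
// Host-side usage (for orientation only). The sketch below shows roughly how the
// wrapper methods in this file are expected to be driven per time step; it is not
// NAMD's actual driver code (that lives in the host-side CudaComputeNonbonded
// class), and the object names, flags, and buffers are placeholders.
//
//   CudaComputeNonbondedKernel kernel(deviceID, cudaNonbondedTables, /*doStreaming=*/false);
//   kernel.bindExclusions(numExclusions, exclusion_bits);                // once at setup
//   kernel.updateVdwTypesExcl(atomStorageSize, h_vdwTypes,
//     h_exclIndexMaxDiff, h_atomIndex, stream);                          // after atom migration
//   kernel.nonbondedForce(tlKernel, atomStorageSize, doPairlist,
//     doEnergy, doVirial, doSlow, lata, latb, latc, h_xyzq, cutoff2,
//     d_forces, d_forcesSlow, h_forces, h_forcesSlow, stream);           // every step
//   if (doEnergy || doVirial)
//     kernel.reduceVirialEnergy(tlKernel, atomStorageSize, doEnergy, doVirial,
//       doSlow, doGBIS, d_forces, d_forcesSlow, d_virialEnergy, stream);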