3 #if __CUDACC_VER_MAJOR__ >= 11
6 #include <namd_cub/cub.cuh>
7 #endif //CUDACC version
9 #ifdef NAMD_HIP //NAMD_HIP
10 #include <hip/hip_runtime.h>
11 #include <hipcub/hipcub.hpp>
19 #define __thread __declspec(thread)
23 #ifdef NAMD_CUDA //Handles NVIDIA
24 #define NONBONDKERNEL_NUM_WARP 4
25 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
26 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
27 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
29 #define NONBONDKERNEL_NUM_WARP 1
30 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 4
31 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 4
32 #define REDUCEGBISENERGYKERNEL_NUM_WARP 4
35 #define OVERALLOC 1.2f
39 #define MAX_CONST_EXCLUSIONS 2048 // cache size is 8k
// NOTE(review): fragment — the function signature (orig. lines 44-45) and the
// `return make_float4(` / closing brace are elided from this chunk; the numeric
// prefixes on each line are extraction residue, not code.
// Visible behavior: linear interpolation into a float4 table bound to a texture
// object `tex`. Index math: scale k by tableSize, subtract 0.5 to center the
// sample, split into integer part i and fraction a, clamp i0 to tableSize-1,
// fetch the two bracketing entries via tex1Dfetch, and lerp component-wise.
43 __device__ __forceinline__
46 const float x = k * (float)tableSize - 0.5f;
47 const float f = floorf(x);
48 const float a = x - f;
// Unsigned cast: negative f would wrap — presumably k >= 0.5/tableSize is
// assumed upstream; TODO confirm against callers.
49 const unsigned int i = (
unsigned int)f;
// Clamp upper index so i1 = i0+1 never reads past the table end.
50 const int i0 = i < tableSize - 1 ? i : tableSize - 1;
51 const int i1 = i0 + 1;
52 const float4 t0 = tex1Dfetch<float4>(tex, i0);
53 const float4 t1 = tex1Dfetch<float4>(tex, i1);
// Component-wise lerp: t0 + a*(t1-t0), one line per float4 component.
55 a * (t1.x - t0.x) + t0.x,
56 a * (t1.y - t0.y) + t0.y,
57 a * (t1.z - t0.z) + t0.z,
58 a * (t1.w - t0.w) + t0.w);
// NOTE(review): fragment — `tableSize` is referenced but not declared here, so it
// is presumably a template parameter on an elided header line (orig. 68 region);
// the `return make_float4(` line (orig. 80) and closing brace are also elided.
// Visible behavior: same linear table interpolation as the texture variant above,
// but reading from a plain float4 array through __ldg (read-only data cache).
68 __device__ __forceinline__
69 float4 tableLookup(
const float4* table,
const float k)
72 const float x = k *
static_cast<float>(tableSize) - 0.5f;
73 const float f = floorf(x);
74 const float a = x - f;
75 const int i =
static_cast<int>(f);
// Unlike the texture variant, both indices are clamped to [0, tableSize-1],
// so negative i is handled here.
76 const int i0 = max(0, min(tableSize - 1, i));
77 const int i1 = max(0, min(tableSize - 1, i + 1));
// __ldg forces loads through the read-only cache; table must be read-only
// for the kernel's lifetime.
78 const float4 t0 =
__ldg(&table[i0]);
79 const float4 t1 =
__ldg(&table[i1]);
// Component-wise lerp t0 + a*(t1 - t0).
81 a * (t1.x - t0.x) + t0.x,
82 a * (t1.y - t0.y) + t0.y,
83 a * (t1.z - t0.z) + t0.z,
84 a * (t1.w - t0.w) + t0.w);
// NOTE(review): fragment — large interior elisions (orig. 93, 96-99, 102,
// 105-120, 123-124, 128, 130-141, 151-153 are missing), including the function
// name line, the ljab/fi/ei table lookups, and the fast-force accumulation.
// Visible behavior: per-pair nonbonded force/energy evaluation. fi/ei are
// presumably float4 force/energy table samples (x: elec, y/z: VdW terms,
// w: slow/PME correction) — TODO confirm against the elided lookups.
89 template<
bool doEnergy,
bool doSlow>
90 __device__ __forceinline__
92 const float dx,
const float dy,
const float dz,
// Table parameters differ by build flavor: float4 arrays under
// USE_TABLE_ARRAYS, texture objects otherwise (elided #else branch).
94 #ifdef USE_TABLE_ARRAYS
95 const float4* __restrict__ forceTable,
const float4* __restrict__ energyTable,
// Accumulators are in/out references; j-side forces are subtracted
// (Newton's third law) in the elided/visible tails below.
100 float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
101 float& energyVdw,
float& energyElec,
float& energySlow) {
// Row/column addressing into the VdW coefficient table; vdwtypei is
// presumably pre-multiplied by the table width — TODO confirm.
103 int vdwIndex = vdwtypej + vdwtypei;
104 #if __CUDA_ARCH__ >= 350
// Round-to-nearest reciprocal sqrt of the squared distance.
110 float rinv = __frsqrt_rn(r2);
// fSlow starts as the charge product qi*qj; combined force scalar f mixes
// LJ (ljab.x/.y with fi.z/.y) and electrostatics (fSlow*fi.x).
121 float fSlow = qi * qj;
122 float f = ljab.
x * fi.z + ljab.
y * fi.y + fSlow * fi.x;
125 energyVdw += ljab.
x * ei.z + ljab.
y * ei.y;
126 energyElec += fSlow * ei.x;
127 if (doSlow) energySlow += fSlow * ei.w;
129 if (doSlow) fSlow *= fi.w;
// Slow-force accumulation: add to i, subtract from j (action/reaction).
// The analogous fast-force block (orig. 130-141) is elided here.
142 float fxSlow = dx * fSlow;
143 float fySlow = dy * fSlow;
144 float fzSlow = dz * fSlow;
145 iforceSlow.x += fxSlow;
146 iforceSlow.y += fySlow;
147 iforceSlow.z += fzSlow;
148 jforceSlow.x -= fxSlow;
149 jforceSlow.y -= fySlow;
150 jforceSlow.z -= fzSlow;
// NOTE(review): fragment — the #else, closing braces, and the doSlow guards
// around the slow-force stores (orig. 163, 169-170, 175-177, 181, 185-189) are
// elided. Visible behavior: atomically accumulate one atom's force (and, under
// doSlow, its slow force) into SoA x/y/z device arrays at index `pos`.
154 template<
bool doSlow>
155 __device__ __forceinline__
156 void storeForces(
const int pos,
const float3 force,
const float3 forceSlow,
157 float* __restrict__ devForces_x,
158 float* __restrict__ devForces_y,
159 float* __restrict__ devForces_z,
160 float* __restrict__ devForcesSlow_x,
161 float* __restrict__ devForcesSlow_y,
162 float* __restrict__ devForcesSlow_z)
// HIP >= 3.4: use atomicAddNoRet (no return value, cheaper on AMD) and skip
// the atomics entirely when the contribution is exactly zero.
164 #if defined(NAMD_HIP) && ((HIP_VERSION_MAJOR == 3) && (HIP_VERSION_MINOR > 3) || (HIP_VERSION_MAJOR > 3))
165 if (force.x != 0.0f || force.y != 0.0f || force.z != 0.0f) {
166 atomicAddNoRet(&devForces_x[pos], force.x);
167 atomicAddNoRet(&devForces_y[pos], force.y);
168 atomicAddNoRet(&devForces_z[pos], force.z);
171 if (forceSlow.x != 0.0f || forceSlow.y != 0.0f || forceSlow.z != 0.0f) {
172 atomicAddNoRet(&devForcesSlow_x[pos], forceSlow.x);
173 atomicAddNoRet(&devForcesSlow_y[pos], forceSlow.y);
174 atomicAddNoRet(&devForcesSlow_z[pos], forceSlow.z);
// CUDA (and older HIP) path: plain float atomicAdd, unconditional.
178 atomicAdd(&devForces_x[pos], force.x);
179 atomicAdd(&devForces_y[pos], force.y);
180 atomicAdd(&devForces_z[pos], force.z);
182 atomicAdd(&devForcesSlow_x[pos], forceSlow.x);
183 atomicAdd(&devForcesSlow_y[pos], forceSlow.y);
184 atomicAdd(&devForcesSlow_z[pos], forceSlow.z);
// NOTE(review): fragment — the entire body (orig. 193-201) is elided; only the
// template header and parameter list are visible. Presumably rotates j-atom
// state (charge, vdw type, atom index, exclusion bounds) to the next lane via
// warp shuffles — TODO confirm against the full source.
190 template<
bool doPairlist>
191 __device__ __forceinline__
192 void shuffleNext(
float& xyzq_j_w,
int& vdwtypej,
int& jatomIndex,
int& jexclMaxdiff,
int& jexclIndex) {
// NOTE(review): fragment — body (orig. 205-211) elided. Overload of the above
// without the exclusion-range parameters; presumably the non-pairlist-building
// variant of the per-lane j-state rotation — TODO confirm.
202 template<
bool doPairlist>
203 __device__ __forceinline__
204 void shuffleNext(
float& xyzq_j_w,
int& vdwtypej,
int& jatomIndex) {
// NOTE(review): fragment — signature and surrounding lines (orig. 214-230) are
// elided. Visible behavior: squared distance from point b to bounding box a
// (a.x/y/z center, a.wx/wy/wz half-extents): per axis, distance outside the box
// is max(0, |center-point| - halfwidth); inside the box it contributes 0.
212 template<
bool doSlow>
213 __device__ __forceinline__
231 float dx = max(0.0f, fabsf(a.
x - b.x) - a.
wx);
232 float dy = max(0.0f, fabsf(a.
y - b.y) - a.
wy);
233 float dz = max(0.0f, fabsf(a.
z - b.z) - a.
wz);
234 float r2 = dx*dx + dy*dy + dz*dz;
238 #define LARGE_FLOAT (float)(1.0e10)
// NOTE(review): fragment — this is the main tile-based nonbonded force kernel,
// with very large interior elisions (shared-memory declarations, the tile-list
// loop header, pairlist-building branches, warp-vote/ballot logic, and most
// closing braces are missing from this chunk). Comments below annotate only
// what the visible lines establish; structure in between is inferred and
// should be confirmed against the full file.
243 template <
bool doEnergy,
bool doVirial,
bool doSlow,
bool doPairlist,
bool doStreaming>
// Launch-bounds min-blocks argument; all three arms are (10) — presumably a
// collapsed tuning expression. TODO confirm whether the arms were meant to
// differ in the original.
251 doPairlist ? (10) : (doEnergy ? (10) : (10) ))
253 nonbondedForceKernel(
// Lattice vectors for periodic shifts, atom positions+charges (xyzq),
// and the squared cutoff.
259 const float3
lata,
const float3
latb,
const float3
latc,
260 const float4* __restrict__
xyzq,
const float cutoff2,
261 #ifdef USE_TABLE_ARRAYS
262 const float4* __restrict__ forceTable,
const float4* __restrict__ energyTable,
274 #ifdef USE_NEW_EXCL_METHOD
275 const int* __restrict__ minmaxExclAtom,
305 float energyVdw, energyElec, energySlow;
307 unsigned int itileListLen;
// Lane id within the warp and warp id within the block.
317 const int wid = threadIdx.x %
WARPSIZE;
318 const int iwarp = threadIdx.x /
WARPSIZE;
// zeroShift: true when the periodic shift (shx,shy,shz) is exactly zero —
// used later to detect self tiles (iatomStart == jatomStart).
336 bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
338 int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
// Each lane loads one i-atom; bounding box is shifted into this image.
357 float4 xyzq_i =
xyzq[iatomStart + wid];
367 boundingBoxI.
x += shx;
368 boundingBoxI.
y += shy;
369 boundingBoxI.
z += shz;
373 #ifdef USE_NEW_EXCL_METHOD
374 int iatomIndex, minExclAtom, maxExclAtom;
379 #ifdef USE_NEW_EXCL_METHOD
380 iatomIndex =
atomIndex[iatomStart + wid];
381 int2 tmp = minmaxExclAtom[iatomStart + wid];
385 iatomIndex =
atomIndex[iatomStart + wid];
406 if (doSlow) energySlow = 0.0f;
// nexcluded counts excluded pairs (incremented by 2 per pair below),
// only tracked when building the pairlist.
414 if (doPairlist) nexcluded = 0;
419 int nloopi = min(iatomSize - iatomStart,
WARPSIZE);
420 nfreei = max(iatomFreeSize - iatomStart, 0);
437 int iexclIndex, iexclMaxdiff;
441 iexclMaxdiff = tmp.y;
// Loop over j-tiles of this tile list.
444 for (
int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
449 float4 xyzq_j =
xyzq[jatomStart + wid];
// Bounding-box distance cull (pairlist build path).
454 float r2bb =
distsq(boundingBoxI, xyzq_j);
// Stage j-atom data into per-warp shared arrays.
458 int vdwtypej =
vdwTypes[jatomStart + wid];
459 s_vdwtypej[iwarp][wid] = vdwtypej;
463 s_jatomIndex[iwarp][wid] =
atomIndex[jatomStart + wid];
469 int nloopj = min(jatomSize - jatomStart,
WARPSIZE);
470 nfreej = max(jatomFreeSize - jatomStart, 0);
479 s_xyzq[iwarp][wid] = xyzq_j;
// Self tile: same atom block against itself with zero shift; start the
// diagonal sweep at t=1 so each pair is evaluated once.
482 const bool self = zeroShift && (iatomStart == jatomStart);
485 s_jforce[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
487 s_jforceSlow[iwarp][wid] = make_float3(0.0f, 0.0f, 0.0f);
491 int t = (
self) ? 1 : 0;
// Diagonal iteration: lane wid pairs with shared slot (t + wid) mod warp
// size, so all lanes touch distinct j slots each step (no shared-mem races).
503 int j = (0 + wid) & modval;
504 xyzq_j = s_xyzq[iwarp][j];
505 float dx = xyzq_j.x - xyzq_i.x;
506 float dy = xyzq_j.y - xyzq_i.y;
507 float dz = xyzq_j.z - xyzq_i.z;
509 float r2 = dx*dx + dy*dy + dz*dz;
518 int j = (t + wid) & modval;
522 xyzq_j = s_xyzq[iwarp][j];
523 float dx = xyzq_j.x - xyzq_i.x;
524 float dy = xyzq_j.y - xyzq_i.y;
525 float dz = xyzq_j.z - xyzq_i.z;
526 float r2 = dx*dx + dy*dy + dz*dz;
// Pairlist-build path: exclusion lookup via bitmask. indexdiff relative
// to iexclIndex selects a 32-bit word (>>5) and a bit (&31).
530 if (j < nfreej || wid < nfreei) {
531 bool excluded =
false;
532 int indexdiff = s_jatomIndex[iwarp][j] - iatomIndex;
533 if ( abs(indexdiff) <= iexclMaxdiff) {
534 indexdiff += iexclIndex;
535 int indexword = ((
unsigned int) indexdiff) >> 5;
545 excluded = ((indexword & (1<<(indexdiff&31))) != 0);
// Count both i->j and j->i, hence += 2.
547 if (excluded) nexcluded += 2;
548 if (!excluded) excl |= (
WarpMask)1 << (WARPSIZE-1);
549 if (!excluded && r2 <
cutoff2) {
550 calcForceEnergy<doEnergy, doSlow>(
551 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
552 vdwtypei, s_vdwtypej[iwarp][j],
554 #ifdef USE_TABLE_ARRAYS
555 forceTable, energyTable,
560 s_jforce[iwarp][j], s_jforceSlow[iwarp][j],
562 energyElec, energySlow);
// Non-pairlist path: exclusion mask comes from precomputed tile excls
// (elided); same diagonal sweep and force evaluation.
577 int j = (wid+t) & (WARPSIZE-1);
578 xyzq_j = s_xyzq[iwarp][j];
579 float dx = xyzq_j.x - xyzq_i.x;
580 float dy = xyzq_j.y - xyzq_i.y;
581 float dz = xyzq_j.z - xyzq_i.z;
583 float r2 = dx*dx + dy*dy + dz*dz;
586 calcForceEnergy<doEnergy, doSlow>(
587 r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
588 vdwtypei, s_vdwtypej[iwarp][j],
590 #ifdef USE_TABLE_ARRAYS
591 forceTable, energyTable,
597 s_jforceSlow[iwarp][j],
598 energyVdw, energyElec, energySlow);
// Flush accumulated j-forces for this tile to global memory.
607 storeForces<doSlow>(jatomStart + wid, s_jforce[iwarp][wid], s_jforceSlow[iwarp][wid],
// Record whether this j-tile had any interactions (pairlist pruning).
617 if (wid == 0)
jtiles[jtile] = anyexcl;
623 itileListLen += anyexcl;
// Flush i-forces after all j-tiles.
631 storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow,
// Tile-list bookkeeping: low 16 bits of itileListLen presumably carry the
// kept-tile count — TODO confirm bitfield layout.
655 if ((itileListLen & 65535) > 0) atomicAdd(&
tileListStat->numTileLists, 1);
657 if (itileListLen > 0) atomicAdd(&
tileListStat->numTileListsGBIS, 1);
// Warp-level reduction of the exclusion counter via CUB.
661 typedef cub::WarpReduce<int> WarpReduceInt;
663 int warpId = threadIdx.x /
WARPSIZE;
666 volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
667 if (wid == 0) atomicAdd(&
tileListStat->numExcluded, nexcludedWarp);
// Virial path: warp-reduce per-tile-list net forces and store them for the
// later virial reduction kernel.
675 typedef cub::WarpReduce<float> WarpReduce;
677 int warpId = threadIdx.x /
WARPSIZE;
678 volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
680 volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
682 volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
685 virialEnergy[
itileList].forcex = iforcexSum;
686 virialEnergy[
itileList].forcey = iforceySum;
687 virialEnergy[
itileList].forcez = iforcezSum;
691 iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
693 iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
695 iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
698 virialEnergy[
itileList].forceSlowx = iforcexSum;
699 virialEnergy[
itileList].forceSlowy = iforceySum;
700 virialEnergy[
itileList].forceSlowz = iforcezSum;
// Energy path: butterfly-style reduction loop (body elided), then lane 0
// writes the per-tile-list energies.
710 for (
int i=WARPSIZE/2;i >= 1;i/=2) {
716 if (threadIdx.x % WARPSIZE == 0) {
717 virialEnergy[
itileList].energyVdw = energyVdw;
718 virialEnergy[
itileList].energyElec = energyElec;
719 if (doSlow) virialEnergy[
itileList].energySlow = energySlow;
// Streaming path: count completed tile lists per patch with atomicInc
// (wraps at patchNumList-1); the lane seeing the final count owns flushing
// that patch's forces to mapped host memory.
728 int patchDone[2] = {
false,
false};
729 const int wid = threadIdx.x %
WARPSIZE;
731 int patchCountOld0 = atomicInc(&
patchNumCount[patchInd.x], (
unsigned int)(patchNumList.x-1));
732 patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
733 if (patchInd.x != patchInd.y) {
734 int patchCountOld1 = atomicInc(&
patchNumCount[patchInd.y], (
unsigned int)(patchNumList.y-1));
735 patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
// Copy this patch's SoA forces into the mapped float4 output array,
// one warp striding over the patch's atom range.
747 for (
int i=start+wid;i < end;i+=
WARPSIZE) {
748 mapForces[i] = make_float4(devForce_x[i],
763 for (
int i=start+wid;i < end;i+=
WARPSIZE) {
774 if (patchDone[0] || patchDone[1]) {
// System-wide fence so the host sees the mapped-memory force writes
// before the patch is published to the ready queue.
777 __threadfence_system();
781 int ind = atomicAdd(&
tileListStat->patchReadyQueueCount, 1);
786 int ind = atomicAdd(&
tileListStat->patchReadyQueueCount, 1);
// Record tile-list completion order for output sorting (streaming only).
794 if (doStreaming &&
outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
795 int index = atomicAdd(&
tileListStat->outputOrderIndex, 1);
// NOTE(review): fragment of reduceNonbondedVirialKernel — kernel name line and
// several interior lines are elided. Visible behavior: grid-stride loop over
// atoms; per-thread outer products force*position are block-reduced with CUB
// BlockReduce and accumulated into virialEnergy[bin] as doubles via atomicAdd.
805 const int atomStorageSize,
806 const float4* __restrict__
xyzq,
807 const float4* __restrict__ devForces,
const float4* __restrict__ devForcesSlow,
// Grid-stride loop so any launch size covers atomStorageSize.
810 for (
int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
812 int i = ibase + threadIdx.x;
819 float4 force, forceSlow;
// Tail guard: only in-range threads load; out-of-range threads presumably
// contribute zeros to the reduction (initialization elided).
826 if (i < atomStorageSize) {
828 force = devForces[i];
829 if (doSlow) forceSlow = devForcesSlow[i];
// 3x3 virial tensor terms: v{ab}t = force.a * pos.b.
832 float vxxt = force.x*pos.x;
833 float vxyt = force.x*pos.y;
834 float vxzt = force.x*pos.z;
835 float vyxt = force.y*pos.x;
836 float vyyt = force.y*pos.y;
837 float vyzt = force.y*pos.z;
838 float vzxt = force.z*pos.x;
839 float vzyt = force.z*pos.y;
840 float vzzt = force.z*pos.z;
// One BlockReduce per tensor component, reusing shared tempStorage —
// hence the BLOCK_SYNC between consecutive reductions.
854 typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
855 __shared__
typename BlockReduce::TempStorage
tempStorage;
856 float vxx = BlockReduce(tempStorage).Sum(vxxt);
BLOCK_SYNC;
857 float vxy = BlockReduce(tempStorage).Sum(vxyt);
BLOCK_SYNC;
858 float vxz = BlockReduce(tempStorage).Sum(vxzt);
BLOCK_SYNC;
859 float vyx = BlockReduce(tempStorage).Sum(vyxt);
BLOCK_SYNC;
860 float vyy = BlockReduce(tempStorage).Sum(vyyt);
BLOCK_SYNC;
861 float vyz = BlockReduce(tempStorage).Sum(vyzt);
BLOCK_SYNC;
862 float vzx = BlockReduce(tempStorage).Sum(vzxt);
BLOCK_SYNC;
863 float vzy = BlockReduce(tempStorage).Sum(vzyt);
BLOCK_SYNC;
864 float vzz = BlockReduce(tempStorage).Sum(vzzt);
BLOCK_SYNC;
// Thread 0 commits the block's partial virial; double atomicAdd requires
// SM60+ on CUDA.
865 if (threadIdx.x == 0) {
866 atomicAdd(&virialEnergy[bin].virial[0], (
double)vxx);
867 atomicAdd(&virialEnergy[bin].virial[1], (
double)vxy);
868 atomicAdd(&virialEnergy[bin].virial[2], (
double)vxz);
869 atomicAdd(&virialEnergy[bin].virial[3], (
double)vyx);
870 atomicAdd(&virialEnergy[bin].virial[4], (
double)vyy);
871 atomicAdd(&virialEnergy[bin].virial[5], (
double)vyz);
872 atomicAdd(&virialEnergy[bin].virial[6], (
double)vzx);
873 atomicAdd(&virialEnergy[bin].virial[7], (
double)vzy);
874 atomicAdd(&virialEnergy[bin].virial[8], (
double)vzz);
// Same pattern for the slow (long-range) force virial.
879 float vxxSlowt = forceSlow.x*pos.x;
880 float vxySlowt = forceSlow.x*pos.y;
881 float vxzSlowt = forceSlow.x*pos.z;
882 float vyxSlowt = forceSlow.y*pos.x;
883 float vyySlowt = forceSlow.y*pos.y;
884 float vyzSlowt = forceSlow.y*pos.z;
885 float vzxSlowt = forceSlow.z*pos.x;
886 float vzySlowt = forceSlow.z*pos.y;
887 float vzzSlowt = forceSlow.z*pos.z;
897 float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt);
BLOCK_SYNC;
898 float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt);
BLOCK_SYNC;
899 float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt);
BLOCK_SYNC;
900 float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt);
BLOCK_SYNC;
901 float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt);
BLOCK_SYNC;
902 float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt);
BLOCK_SYNC;
903 float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt);
BLOCK_SYNC;
904 float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt);
BLOCK_SYNC;
905 float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt);
BLOCK_SYNC;
906 if (threadIdx.x == 0) {
907 atomicAdd(&virialEnergy[bin].virialSlow[0], (
double)vxxSlow);
908 atomicAdd(&virialEnergy[bin].virialSlow[1], (
double)vxySlow);
909 atomicAdd(&virialEnergy[bin].virialSlow[2], (
double)vxzSlow);
910 atomicAdd(&virialEnergy[bin].virialSlow[3], (
double)vyxSlow);
911 atomicAdd(&virialEnergy[bin].virialSlow[4], (
double)vyySlow);
912 atomicAdd(&virialEnergy[bin].virialSlow[5], (
double)vyzSlow);
913 atomicAdd(&virialEnergy[bin].virialSlow[6], (
double)vzxSlow);
914 atomicAdd(&virialEnergy[bin].virialSlow[7], (
double)vzySlow);
915 atomicAdd(&virialEnergy[bin].virialSlow[8], (
double)vzzSlow);
// NOTE(review): fragment of reduceVirialEnergyKernel — kernel name line,
// per-tile-list loads, and closing braces elided. Visible behavior: grid-stride
// loop over tile lists; block-reduces the per-tile-list virial components
// (fast, then slow) and energies, committing each block's partials to
// virialEnergy[bin] with double atomicAdd.
924 const bool doEnergy,
const bool doVirial,
const bool doSlow,
929 for (
int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
933 if (itileList < numTileLists) {
// Fast-force virial: nine sequential BlockReduce::Sum calls sharing one
// tempStorage, separated by BLOCK_SYNC.
959 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
960 __shared__
typename BlockReduce::TempStorage
tempStorage;
970 float vxx = BlockReduce(tempStorage).Sum(vxxt);
BLOCK_SYNC;
971 float vxy = BlockReduce(tempStorage).Sum(vxyt);
BLOCK_SYNC;
972 float vxz = BlockReduce(tempStorage).Sum(vxzt);
BLOCK_SYNC;
973 float vyx = BlockReduce(tempStorage).Sum(vyxt);
BLOCK_SYNC;
974 float vyy = BlockReduce(tempStorage).Sum(vyyt);
BLOCK_SYNC;
975 float vyz = BlockReduce(tempStorage).Sum(vyzt);
BLOCK_SYNC;
976 float vzx = BlockReduce(tempStorage).Sum(vzxt);
BLOCK_SYNC;
977 float vzy = BlockReduce(tempStorage).Sum(vzyt);
BLOCK_SYNC;
978 float vzz = BlockReduce(tempStorage).Sum(vzzt);
BLOCK_SYNC;
979 if (threadIdx.x == 0) {
980 atomicAdd(&virialEnergy[bin].virial[0], (
double)vxx);
981 atomicAdd(&virialEnergy[bin].virial[1], (
double)vxy);
982 atomicAdd(&virialEnergy[bin].virial[2], (
double)vxz);
983 atomicAdd(&virialEnergy[bin].virial[3], (
double)vyx);
984 atomicAdd(&virialEnergy[bin].virial[4], (
double)vyy);
985 atomicAdd(&virialEnergy[bin].virial[5], (
double)vyz);
986 atomicAdd(&virialEnergy[bin].virial[6], (
double)vzx);
987 atomicAdd(&virialEnergy[bin].virial[7], (
double)vzy);
988 atomicAdd(&virialEnergy[bin].virial[8], (
double)vzz);
// Slow-force virial: identical structure, written to virialSlow[].
992 typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
993 __shared__
typename BlockReduce::TempStorage
tempStorage;
1003 float vxx = BlockReduce(tempStorage).Sum(vxxt);
BLOCK_SYNC;
1004 float vxy = BlockReduce(tempStorage).Sum(vxyt);
BLOCK_SYNC;
1005 float vxz = BlockReduce(tempStorage).Sum(vxzt);
BLOCK_SYNC;
1006 float vyx = BlockReduce(tempStorage).Sum(vyxt);
BLOCK_SYNC;
1007 float vyy = BlockReduce(tempStorage).Sum(vyyt);
BLOCK_SYNC;
1008 float vyz = BlockReduce(tempStorage).Sum(vyzt);
BLOCK_SYNC;
1009 float vzx = BlockReduce(tempStorage).Sum(vzxt);
BLOCK_SYNC;
1010 float vzy = BlockReduce(tempStorage).Sum(vzyt);
BLOCK_SYNC;
1011 float vzz = BlockReduce(tempStorage).Sum(vzzt);
BLOCK_SYNC;
1012 if (threadIdx.x == 0) {
1013 atomicAdd(&virialEnergy[bin].virialSlow[0], (
double)vxx);
1014 atomicAdd(&virialEnergy[bin].virialSlow[1], (
double)vxy);
1015 atomicAdd(&virialEnergy[bin].virialSlow[2], (
double)vxz);
1016 atomicAdd(&virialEnergy[bin].virialSlow[3], (
double)vyx);
1017 atomicAdd(&virialEnergy[bin].virialSlow[4], (
double)vyy);
1018 atomicAdd(&virialEnergy[bin].virialSlow[5], (
double)vyz);
1019 atomicAdd(&virialEnergy[bin].virialSlow[6], (
double)vzx);
1020 atomicAdd(&virialEnergy[bin].virialSlow[7], (
double)vzy);
1021 atomicAdd(&virialEnergy[bin].virialSlow[8], (
double)vzz);
// Energy path: double-precision block reduce (calls elided), thread 0
// commits Vdw/Elec (and Slow when doSlow) sums.
1027 typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1028 __shared__
typename BlockReduce::TempStorage
tempStorage;
1031 if (threadIdx.x == 0) {
1032 atomicAdd(&virialEnergy[bin].energyVdw, energyVdw);
1033 atomicAdd(&virialEnergy[bin].energyElec, energyElec);
1037 if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energySlow, energySlow);
// NOTE(review): fragment of reduceGBISEnergyKernel (signature appears,
// flattened, near the end of this chunk). Visible behavior: grid-stride loop
// over tile lists; each thread picks up its tile list's GBIS energy (zero when
// out of range), block-reduces in double precision, and thread 0 atomically
// accumulates into virialEnergy[bin].energyGBIS.
1053 for (
int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
1056 double energyGBISt = 0.0;
1057 if (itileList < numTileLists) {
1058 energyGBISt = tileListVirialEnergy[
itileList].energyGBIS;
1063 typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
1064 __shared__
typename BlockReduce::TempStorage
tempStorage;
1065 double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt);
BLOCK_SYNC;
1066 if (threadIdx.x == 0) atomicAdd(&virialEnergy[bin].energyGBIS, energyGBIS);
// NOTE(review): fragment of reduceNonbondedBinsKernel — launched with one block
// of ATOMIC_BINS threads (see the host launch below); each thread owns one bin.
// WarpReduce collapses the per-bin partial sums and thread 0 writes the final
// totals into virialEnergy[0]. The logical width is ATOMIC_BINS, padded to 2
// because cub::WarpReduce requires width >= 2.
1072 const bool doVirial,
1073 const bool doEnergy,
1078 const int bin = threadIdx.x;
1080 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ?
ATOMIC_BINS : 2)> WarpReduce;
1081 __shared__
typename WarpReduce::TempStorage
tempStorage;
// Fast-force virial: reduce each of the nine tensor components across bins.
1084 double vxx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[0]);
1085 double vxy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[1]);
1086 double vxz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[2]);
1087 double vyx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[3]);
1088 double vyy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[4]);
1089 double vyz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[5]);
1090 double vzx = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[6]);
1091 double vzy = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[7]);
1092 double vzz = WarpReduce(tempStorage).Sum(virialEnergy[bin].virial[8]);
// Thread 0 overwrites bin 0 in place with the combined totals.
1093 if (threadIdx.x == 0) {
1094 virialEnergy->virial[0] = vxx;
1095 virialEnergy->virial[1] = vxy;
1096 virialEnergy->virial[2] = vxz;
1097 virialEnergy->virial[3] = vyx;
1098 virialEnergy->virial[4] = vyy;
1099 virialEnergy->virial[5] = vyz;
1100 virialEnergy->virial[6] = vzx;
1101 virialEnergy->virial[7] = vzy;
1102 virialEnergy->virial[8] = vzz;
// Slow-force virial: same reduction into virialSlow[].
1106 double vxxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[0]);
1107 double vxySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[1]);
1108 double vxzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[2]);
1109 double vyxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[3]);
1110 double vyySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[4]);
1111 double vyzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[5]);
1112 double vzxSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[6]);
1113 double vzySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[7]);
1114 double vzzSlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].virialSlow[8]);
1115 if (threadIdx.x == 0) {
1116 virialEnergy->virialSlow[0] = vxxSlow;
1117 virialEnergy->virialSlow[1] = vxySlow;
1118 virialEnergy->virialSlow[2] = vxzSlow;
1119 virialEnergy->virialSlow[3] = vyxSlow;
1120 virialEnergy->virialSlow[4] = vyySlow;
1121 virialEnergy->virialSlow[5] = vyzSlow;
1122 virialEnergy->virialSlow[6] = vzxSlow;
1123 virialEnergy->virialSlow[7] = vzySlow;
1124 virialEnergy->virialSlow[8] = vzzSlow;
// Energies: Vdw/Elec always under doEnergy (guard elided), Slow and GBIS
// under their respective flags.
1130 double energyVdw = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyVdw);
1131 double energyElec = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyElec);
1132 if (threadIdx.x == 0) {
1133 virialEnergy->energyVdw = energyVdw;
1134 virialEnergy->energyElec = energyElec;
1137 double energySlow = WarpReduce(tempStorage).Sum(virialEnergy[bin].energySlow);
1138 if (threadIdx.x == 0) {
1139 virialEnergy->energySlow = energySlow;
1143 double energyGBIS = WarpReduce(tempStorage).Sum(virialEnergy[bin].energyGBIS);
1144 if (threadIdx.x == 0) {
1145 virialEnergy->energyGBIS = energyGBIS;
// NOTE(review): constructor fragment — the first line of the signature and a
// few member initializations (orig. 1157-1159, 1162, 1165-1171, etc.) are
// elided. Visible behavior: stores deviceID / table reference / streaming flag
// via the init list, then nulls all lazily-allocated buffer pointers and zeroes
// their size counters so the reallocate_* helpers start from a clean state.
1156 bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
1160 overflowExclusions = NULL;
1161 overflowExclusionsSize = 0;
1163 exclIndexMaxDiff = NULL;
1164 exclIndexMaxDiffSize = 0;
1172 patchNumCount = NULL;
1173 patchNumCountSize = 0;
1175 patchReadyQueue = NULL;
1176 patchReadyQueueSize = 0;
// SoA force buffers (x/y/z/w split) for fast and slow forces.
1178 force_x = force_y = force_z = force_w = NULL;
1180 forceSlow_x = forceSlow_y = forceSlow_z = forceSlow_w = NULL;
// NOTE(review): fragment — enclosing function signature elided. Grows the eight
// SoA force arrays to at least atomStorageSize with a 1.4x over-allocation
// factor. Note forceSize is passed for all four fast arrays and forceSlowSize
// for all four slow arrays, so the recorded size reflects the last call of each
// group — consistent only because all four share the same requested size.
1186 reallocate_device<float>(&force_x, &forceSize, atomStorageSize, 1.4f);
1187 reallocate_device<float>(&force_y, &forceSize, atomStorageSize, 1.4f);
1188 reallocate_device<float>(&force_z, &forceSize, atomStorageSize, 1.4f);
1189 reallocate_device<float>(&force_w, &forceSize, atomStorageSize, 1.4f);
1190 reallocate_device<float>(&forceSlow_x, &forceSlowSize, atomStorageSize, 1.4f);
1191 reallocate_device<float>(&forceSlow_y, &forceSlowSize, atomStorageSize, 1.4f);
1192 reallocate_device<float>(&forceSlow_z, &forceSlowSize, atomStorageSize, 1.4f);
1193 reallocate_device<float>(&forceSlow_w, &forceSlowSize, atomStorageSize, 1.4f);
// NOTE(review): destructor body fragment — signature and (presumably) a
// cudaSetDevice guard are elided. Frees every buffer the class may have
// allocated; patchReadyQueue is host (mapped) memory, hence deallocate_host,
// everything else is device memory.
1198 if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
1199 if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
1200 if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
1201 if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
1202 if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
1203 if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
1204 if (force_x != NULL) deallocate_device<float>(&force_x);
1205 if (force_y != NULL) deallocate_device<float>(&force_y);
1206 if (force_z != NULL) deallocate_device<float>(&force_z);
1207 if (force_w != NULL) deallocate_device<float>(&force_w);
1208 if (forceSlow_x != NULL) deallocate_device<float>(&forceSlow_x);
1209 if (forceSlow_y != NULL) deallocate_device<float>(&forceSlow_y);
1210 if (forceSlow_z != NULL) deallocate_device<float>(&forceSlow_z);
1211 if (forceSlow_w != NULL) deallocate_device<float>(&forceSlow_w);
// NOTE(review): fragment — first signature line (orig. ~1214, with
// atomStorageSize and h_vdwTypes) elided; see the flattened prototype near the
// end of this chunk. Grows the per-atom vdw-type, exclusion-range, and
// atom-index device arrays (OVERALLOC growth factor) and uploads the host
// copies asynchronously on `stream`. Caller must keep the host buffers alive
// until the stream copy completes.
1215 const int2* h_exclIndexMaxDiff,
const int* h_atomIndex, cudaStream_t
stream) {
1217 reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize,
OVERALLOC);
1218 reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize,
OVERALLOC);
1219 reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize,
OVERALLOC);
1221 copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize,
stream);
1222 copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize,
stream);
1223 copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize,
stream);
// NOTE(review): fragment — signature and the `if (!doStreaming)` guard line are
// elided; visible logic: fatal error when called on a non-streaming kernel,
// otherwise return the mapped host patch-ready queue.
1228 NAMD_die(
"CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
1230 return patchReadyQueue;
// NOTE(review): fragment — kernel name/return line, the f/fSlow output
// parameters, the `if (tid < n)` guard, and the doSlow branch structure are
// elided. Visible behavior: gathers the SoA float arrays back into AoS float4
// outputs, one atom per thread; the slow-force gather is presumably guarded by
// the doSlow template parameter — TODO confirm.
1233 template <
int doSlow>
1235 float *fx,
float *fy,
float *fz,
float *fw,
1236 float *fSlowx,
float *fSlowy,
float *fSlowz,
float *fSloww,
1239 int tid = blockIdx.x*blockDim.x + threadIdx.x;
1241 f[tid] = make_float4(fx[tid], fy[tid], fz[tid], fw[tid]);
1243 fSlow[tid] = make_float4(fSlowx[tid], fSlowy[tid], fSlowz[tid], fSloww[tid]);
// NOTE(review): fragment of the host-side nonbondedForce launcher — the method
// name line, shared-memory sizing, nwarp/nblock computation, and several
// closing braces are elided (see the flattened prototype near the end of this
// chunk for the full signature). Visible flow: upload positions (pairlist
// reuse path), clear SoA force buffers, set up streaming bookkeeping, dispatch
// one of 32 template instantiations of nonbondedForceKernel, then transpose
// SoA forces to the float4 outputs.
1251 const int atomStorageSize,
const bool doPairlist,
1252 const bool doEnergy,
const bool doVirial,
const bool doSlow,
1253 const float3
lata,
const float3
latb,
const float3
latc,
1254 const float4* h_xyzq,
const float cutoff2,
1255 float4* d_forces, float4* d_forcesSlow,
1256 float4* h_forces, float4* h_forcesSlow,
// When the pairlist is reused, positions must be re-uploaded here; on
// pairlist builds they were presumably staged earlier — TODO confirm.
1259 if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.
get_xyzq(), atomStorageSize,
stream);
// Zero all SoA force accumulators (kernel accumulates with atomicAdd).
1269 clear_device_array<float>(force_x, atomStorageSize,
stream);
1270 clear_device_array<float>(force_y, atomStorageSize,
stream);
1271 clear_device_array<float>(force_z, atomStorageSize,
stream);
1272 clear_device_array<float>(force_w, atomStorageSize,
stream);
1274 clear_device_array<float>(forceSlow_x, atomStorageSize,
stream);
1275 clear_device_array<float>(forceSlow_y, atomStorageSize,
stream);
1276 clear_device_array<float>(forceSlow_z, atomStorageSize,
stream);
1277 clear_device_array<float>(forceSlow_w, atomStorageSize,
stream);
// Streaming setup: mapped host pointers for forces and the ready queue,
// plus per-patch completion counters.
1282 float4* m_forces = NULL;
1283 float4* m_forcesSlow = NULL;
1284 int* m_patchReadyQueue = NULL;
1286 unsigned int* patchNumCountPtr = NULL;
1289 if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
1293 patchNumCountPtr = patchNumCount;
1294 bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize,
numPatches, cudaHostAllocMapped);
1297 for (
int i=0;i <
numPatches;i++) patchReadyQueue[i] = -1;
// Resolve device-visible aliases of the mapped host allocations.
1299 cudaCheck(cudaHostGetDevicePointer((
void**)&m_patchReadyQueue, patchReadyQueue, 0));
1300 cudaCheck(cudaHostGetDevicePointer((
void**)&m_forces, h_forces, 0));
1301 cudaCheck(cudaHostGetDevicePointer((
void**)&m_forcesSlow, h_forcesSlow, 0));
1305 if (doVirial || doEnergy) {
1314 int nthread = WARPSIZE*nwarp;
// Table parameters differ by build: raw float4 arrays vs texture objects.
1322 #ifdef USE_TABLE_ARRAYS
1323 #define TABLE_PARAMS \
1324 cudaNonbondedTables.getForceTable(), cudaNonbondedTables.getEnergyTable()
1326 #define TABLE_PARAMS \
1327 cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex()
// CALL expands to one fully-specialized kernel launch; `called` tracks that
// exactly one branch fired.
1330 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
1331 nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
1332 <<< nblock, nthread, shMemSize, stream >>> (\
1333 start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
1334 cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), cudaNonbondedTables.getVdwCoefTableTex(), \
1335 vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
1337 tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
1338 tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
1339 tlKernel.getBoundingBoxes(), \
1340 force_x, force_y, force_z, force_w, \
1341 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w, \
1342 numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
1343 outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
1345 bool called =
false;
// 2^5 = 32 exhaustive dispatch branches over the five boolean template
// parameters; streaming variants (last arg 1) first, then non-streaming.
1348 if (!doEnergy && !doVirial && !doSlow && !doPairlist)
CALL(0, 0, 0, 0, 1);
1349 if (!doEnergy && !doVirial && doSlow && !doPairlist)
CALL(0, 0, 1, 0, 1);
1350 if (!doEnergy && doVirial && !doSlow && !doPairlist)
CALL(0, 1, 0, 0, 1);
1351 if (!doEnergy && doVirial && doSlow && !doPairlist)
CALL(0, 1, 1, 0, 1);
1352 if ( doEnergy && !doVirial && !doSlow && !doPairlist)
CALL(1, 0, 0, 0, 1);
1353 if ( doEnergy && !doVirial && doSlow && !doPairlist)
CALL(1, 0, 1, 0, 1);
1354 if ( doEnergy && doVirial && !doSlow && !doPairlist)
CALL(1, 1, 0, 0, 1);
1355 if ( doEnergy && doVirial && doSlow && !doPairlist)
CALL(1, 1, 1, 0, 1);
1357 if (!doEnergy && !doVirial && !doSlow && doPairlist)
CALL(0, 0, 0, 1, 1);
1358 if (!doEnergy && !doVirial && doSlow && doPairlist)
CALL(0, 0, 1, 1, 1);
1359 if (!doEnergy && doVirial && !doSlow && doPairlist)
CALL(0, 1, 0, 1, 1);
1360 if (!doEnergy && doVirial && doSlow && doPairlist)
CALL(0, 1, 1, 1, 1);
1361 if ( doEnergy && !doVirial && !doSlow && doPairlist)
CALL(1, 0, 0, 1, 1);
1362 if ( doEnergy && !doVirial && doSlow && doPairlist)
CALL(1, 0, 1, 1, 1);
1363 if ( doEnergy && doVirial && !doSlow && doPairlist)
CALL(1, 1, 0, 1, 1);
1364 if ( doEnergy && doVirial && doSlow && doPairlist)
CALL(1, 1, 1, 1, 1);
1366 if (!doEnergy && !doVirial && !doSlow && !doPairlist)
CALL(0, 0, 0, 0, 0);
1367 if (!doEnergy && !doVirial && doSlow && !doPairlist)
CALL(0, 0, 1, 0, 0);
1368 if (!doEnergy && doVirial && !doSlow && !doPairlist)
CALL(0, 1, 0, 0, 0);
1369 if (!doEnergy && doVirial && doSlow && !doPairlist)
CALL(0, 1, 1, 0, 0);
1370 if ( doEnergy && !doVirial && !doSlow && !doPairlist)
CALL(1, 0, 0, 0, 0);
1371 if ( doEnergy && !doVirial && doSlow && !doPairlist)
CALL(1, 0, 1, 0, 0);
1372 if ( doEnergy && doVirial && !doSlow && !doPairlist)
CALL(1, 1, 0, 0, 0);
1373 if ( doEnergy && doVirial && doSlow && !doPairlist)
CALL(1, 1, 1, 0, 0);
1375 if (!doEnergy && !doVirial && !doSlow && doPairlist)
CALL(0, 0, 0, 1, 0);
1376 if (!doEnergy && !doVirial && doSlow && doPairlist)
CALL(0, 0, 1, 1, 0);
1377 if (!doEnergy && doVirial && !doSlow && doPairlist)
CALL(0, 1, 0, 1, 0);
1378 if (!doEnergy && doVirial && doSlow && doPairlist)
CALL(0, 1, 1, 1, 0);
1379 if ( doEnergy && !doVirial && !doSlow && doPairlist)
CALL(1, 0, 0, 1, 0);
1380 if ( doEnergy && !doVirial && doSlow && doPairlist)
CALL(1, 0, 1, 1, 0);
1381 if ( doEnergy && doVirial && !doSlow && doPairlist)
CALL(1, 1, 0, 1, 0);
1382 if ( doEnergy && doVirial && doSlow && doPairlist)
CALL(1, 1, 1, 1, 0);
1386 NAMD_die(
"CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
// SoA -> AoS transpose into the caller's float4 buffers; doSlow selects the
// kernel specialization that also gathers slow forces.
1391 int grid = (atomStorageSize + block - 1)/block;
1393 transposeForcesKernel<1><<<grid, block, 0, stream>>>(
1394 d_forces, d_forcesSlow,
1395 force_x, force_y, force_z, force_w,
1396 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
1399 transposeForcesKernel<0><<<grid, block, 0, stream>>>(
1400 d_forces, d_forcesSlow,
1401 force_x, force_y, force_z, force_w,
1402 forceSlow_x, forceSlow_y, forceSlow_z, forceSlow_w,
// Advance the tile-list window (presumably within an outer batching loop
// whose header is elided).
1410 start += nblock*nwarp;
// NOTE(review): fragment of the host-side reduceVirialEnergy — the method name
// line and nblock/nthread computations are elided. Visible flow: when energy or
// virial is requested, run the per-atom virial reduction, the per-tile-list
// virial/energy reduction, optionally the GBIS energy reduction, and finally
// collapse the ATOMIC_BINS partial bins into bin 0 with a single-block launch.
1419 const int atomStorageSize,
const bool doEnergy,
const bool doVirial,
const bool doSlow,
const bool doGBIS,
1420 float4* d_forces, float4* d_forcesSlow,
1423 if (doEnergy || doVirial) {
1431 reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>> (
1432 doSlow, atomStorageSize, tlKernel.
get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
1436 if (doVirial || doEnergy)
1440 reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>> (
1445 if (doGBIS && doEnergy)
1449 reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>> (
// One block, one thread per atomic bin (see reduceNonbondedBinsKernel).
1456 reduceNonbondedBinsKernel<<<1, ATOMIC_BINS, 0, stream>>>(doVirial, doEnergy, doSlow, doGBIS, d_virialEnergy);
// NOTE(review): fragment of bindExclusions — signature elided. Grows the
// device exclusion-bitmask buffer and uploads it synchronously (sync copy is
// acceptable here; presumably called once at setup, not per step).
1469 reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
1470 copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
__global__ void reduceGBISEnergyKernel(const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
#define WARP_ALL(MASK, P)
void nonbondedForce(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doPairlist, const bool doEnergy, const bool doVirial, const bool doSlow, const float3 lata, const float3 latb, const float3 latc, const float4 *h_xyzq, const float cutoff2, float4 *d_forces, float4 *d_forcesSlow, float4 *h_forces, float4 *h_forcesSlow, cudaStream_t stream)
CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
void setTileListVirialEnergyLength(int len)
void updateVdwTypesExcl(const int atomStorageSize, const int *h_vdwTypes, const int2 *h_exclIndexMaxDiff, const int *h_atomIndex, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ patchPairs
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_x
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ vdwTypes
void clearTileListStat(cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ tileListStat
int getTileListVirialEnergyGBISLength()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ tileJatomStart
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ mapForces
__global__ void transposeForcesKernel(float4 *f, float4 *fSlow, float *fx, float *fy, float *fz, float *fw, float *fSlowx, float *fSlowy, float *fSlowz, float *fSloww, int n)
~CudaComputeNonbondedKernel()
__device__ __forceinline__ void shuffleNext(float &w)
#define NONBONDKERNEL_NUM_WARP
__device__ __forceinline__ float4 sampleTableTex(cudaTextureObject_t tex, float k)
void reallocate_forceSOA(int atomStorageSize)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
__thread cudaStream_t stream
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
__forceinline__ __device__ void storeForces(const T fx, const T fy, const T fz, const int ind, const int stride, T *force, T *forceList, int *forceListCounter, int *forceListStarts, int *forceListNexts)
__global__ void reduceNonbondedVirialKernel(const bool doSlow, const int atomStorageSize, const float4 *__restrict__ xyzq, const float4 *__restrict__ devForces, const float4 *__restrict__ devForcesSlow, VirialEnergy *__restrict__ virialEnergy)
__global__ void const int const TileList *__restrict__ tileLists
void bindExclusions(int numExclusions, unsigned int *exclusion_bits)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ jtiles
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_w
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ tileListOrder
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ patchNumCount
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t forceTableTex
__global__ void reduceVirialEnergyKernel(const bool doEnergy, const bool doVirial, const bool doSlow, const int numTileLists, const TileListVirialEnergy *__restrict__ tileListVirialEnergy, VirialEnergy *__restrict__ virialEnergy)
__device__ __forceinline__ float distsq(const BoundingBox a, const float4 b)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int vdwCoefTableWidth
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ overflowExclusions
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ cudaPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ vdwCoefTable
#define MAX_CONST_EXCLUSIONS
#define FORCE_ENERGY_TABLE_SIZE
void reduceVirialEnergy(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ devForce_y
int getTileListVirialEnergyLength()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ exclIndexMaxDiff
void NAMD_die(const char *err_msg)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ xyzq
int * getPatchReadyQueue()
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ boundingBoxes
__constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS]
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_y
__global__ void reduceNonbondedBinsKernel(const bool doVirial, const bool doEnergy, const bool doSlow, const bool doGBIS, VirialEnergy *__restrict__ virialEnergy)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ outputOrder
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t energyTableTex
__global__ void const int numTileLists
__shared__ union @43 tempStorage
#define REDUCEVIRIALENERGYKERNEL_NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ tileListDepth
__device__ __forceinline__ void calcForceEnergy(const float r2, const float qi, const float qj, const float dx, const float dy, const float dz, const int vdwtypei, const int vdwtypej, const float2 *__restrict__ vdwCoefTable, cudaTextureObject_t vdwCoefTableTex, cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex, float3 &iforce, float3 &iforceSlow, float3 &jforce, float3 &jforceSlow, float &energyVdw, float &energyElec, float &energySlow)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ atomIndex
TileListVirialEnergy * getTileListVirialEnergy()
__thread DeviceCUDA * deviceCUDA
#define WARP_SHUFFLE_XOR(MASK, VAR, LANE, SIZE)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ mapForcesSlow
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ tileExcls
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t vdwCoefTableTex
#define CALL(DOENERGY, DOVIRIAL)
#define WARP_ANY(MASK, P)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForceSlow_z
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ mapPatchReadyQueue
__global__ void __launch_bounds__(WARPSIZE *NONBONDKERNEL_NUM_WARP, doPairlist?(10):(doEnergy?(10):(10))) nonbondedForceKernel(const int start
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ devForce_x
#define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP
#define REDUCEGBISENERGYKERNEL_NUM_WARP
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForce_w
#define WARP_SHUFFLE(MASK, VAR, LANE, SIZE)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ devForce_z