CudaComputeNonbondedKernel.cu

00001 #include <cuda.h>
00002 #include <cub/cub.cuh>
00003 #include "CudaComputeNonbondedKernel.h"
00004 #include "CudaTileListKernel.h"
00005 #include "DeviceCUDA.h"
00006 #ifdef WIN32
00007 #define __thread __declspec(thread)
00008 #endif
00009 extern __thread DeviceCUDA *deviceCUDA;
00010 
00011 #define OVERALLOC 1.2f
00012 
00013 void NAMD_die(const char *);
00014 
00015 #define MAX_CONST_EXCLUSIONS 2048  // 2048 32-bit words = 8 kB constant cache
00016 __constant__ unsigned int constExclusions[MAX_CONST_EXCLUSIONS];
00017 
00018 #define NONBONDKERNEL_NUM_WARP 4
00019 
00020 template<bool doEnergy, bool doSlow>
00021 __device__ __forceinline__
00022 void calcForceEnergy(const float r2, const float qi, const float qj,
00023   const float dx, const float dy, const float dz,
00024   const int vdwtypei, const int vdwtypej, const float2* __restrict__ vdwCoefTable,
00025   cudaTextureObject_t vdwCoefTableTex, 
00026   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
00027   float3& iforce, float3& iforceSlow, float3& jforce, float3& jforceSlow,
00028   float& energyVdw, float& energyElec, float& energySlow) {
00029 
00030   int vdwIndex = vdwtypej + vdwtypei;
00031 #if __CUDA_ARCH__ >= 350
00032   float2 ljab = __ldg(&vdwCoefTable[vdwIndex]);
00033 #else
00034   float2 ljab = tex1Dfetch<float2>(vdwCoefTableTex, vdwIndex);
00035 #endif
00036 
00037   float rinv = rsqrtf(r2);
00038   float4 ei;
00039   float4 fi = tex1D<float4>(forceTableTex, rinv);
00040   if (doEnergy) ei = tex1D<float4>(energyTableTex, rinv);
00041 
00042   float fSlow = qi * qj;
00043   float f = ljab.x * fi.z + ljab.y * fi.y + fSlow * fi.x;
00044 
00045   if (doEnergy) {
00046     energyVdw  += ljab.x * ei.z + ljab.y * ei.y;
00047     energyElec += fSlow * ei.x;
00048     if (doSlow) energySlow += fSlow * ei.w;
00049   }
00050   if (doSlow) fSlow *= fi.w;
00051 
00052   float fx = dx * f;
00053   float fy = dy * f;
00054   float fz = dz * f;
00055   iforce.x += fx;
00056   iforce.y += fy;
00057   iforce.z += fz;
00058   jforce.x -= fx;
00059   jforce.y -= fy;
00060   jforce.z -= fz;
00061 
00062   if (doSlow) {
00063     float fxSlow = dx * fSlow;
00064     float fySlow = dy * fSlow;
00065     float fzSlow = dz * fSlow;
00066     iforceSlow.x += fxSlow;
00067     iforceSlow.y += fySlow;
00068     iforceSlow.z += fzSlow;
00069     jforceSlow.x -= fxSlow;
00070     jforceSlow.y -= fySlow;
00071     jforceSlow.z -= fzSlow;
00072   }
00073 }
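// Illustration (not part of the NAMD source): a minimal host-side restatement of the table
// arithmetic above, with the texture reads replaced by plain float4 entries. The field layout
// (x = electrostatic, y = vdW B term, z = vdW A term, w = slow/PME correction) is inferred
// from how fi and ei are combined in calcForceEnergy(); the struct and helper name are
// illustrative only.
#if 0
struct PairForceEnergy { float f, fSlow, eVdw, eElec, eSlow; };

static inline PairForceEnergy combineTableEntries(float qi, float qj,
  float2 ljab, float4 fi, float4 ei) {
  PairForceEnergy r;
  float qq = qi * qj;
  r.f     = ljab.x * fi.z + ljab.y * fi.y + qq * fi.x;  // scalar force / r
  r.fSlow = qq * fi.w;                                  // slow (long-range) force / r
  r.eVdw  = ljab.x * ei.z + ljab.y * ei.y;              // Lennard-Jones energy
  r.eElec = qq * ei.x;                                  // short-range electrostatic energy
  r.eSlow = qq * ei.w;                                  // slow electrostatic energy
  return r;
}
#endif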
00074 
00075 template<bool doSlow>
00076 __device__ __forceinline__
00077 void storeForces(const int pos, const float3 force, const float3 forceSlow,
00078   float4* __restrict__ devForces, float4* __restrict__ devForcesSlow) {
00079   atomicAdd(&devForces[pos].x, force.x);
00080   atomicAdd(&devForces[pos].y, force.y);
00081   atomicAdd(&devForces[pos].z, force.z);
00082   if (doSlow) {
00083     atomicAdd(&devForcesSlow[pos].x, forceSlow.x);
00084     atomicAdd(&devForcesSlow[pos].y, forceSlow.y);
00085     atomicAdd(&devForcesSlow[pos].z, forceSlow.z);
00086   }
00087 }
00088 
00089 template<bool doSlow>
00090 __device__ __forceinline__
00091 void storeForces(const int pos, const float3 force, const float3 forceSlow,
00092   float3* __restrict__ forces, float3* __restrict__ forcesSlow) {
00093   atomicAdd(&forces[pos].x, force.x);
00094   atomicAdd(&forces[pos].y, force.y);
00095   atomicAdd(&forces[pos].z, force.z);
00096   if (doSlow) {
00097     atomicAdd(&forcesSlow[pos].x, forceSlow.x);
00098     atomicAdd(&forcesSlow[pos].y, forceSlow.y);
00099     atomicAdd(&forcesSlow[pos].z, forceSlow.z);
00100   }
00101 }
00102 
00103 template<bool doPairlist>
00104 __device__ __forceinline__
00105 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex, int& jexclMaxdiff, int& jexclIndex) {
00106   xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00107   vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00108   if (doPairlist) {
00109     jatomIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);    
00110     jexclIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jexclIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00111     jexclMaxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jexclMaxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00112   }
00113 }
00114 
00115 template<bool doPairlist>
00116 __device__ __forceinline__
00117 void shuffleNext(float& xyzq_j_w, int& vdwtypej, int& jatomIndex) {
00118   xyzq_j_w = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j_w, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00119   vdwtypej = WARP_SHUFFLE(WARP_FULL_MASK, vdwtypej, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00120   if (doPairlist) {
00121     jatomIndex   = WARP_SHUFFLE(WARP_FULL_MASK, jatomIndex, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);    
00122   }
00123 }
00124 
00125 template<bool doSlow>
00126 __device__ __forceinline__
00127 void shuffleNext(float3& jforce, float3& jforceSlow) {
00128   jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00129   jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00130   jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00131   if (doSlow) {
00132     jforceSlow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00133     jforceSlow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00134     jforceSlow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforceSlow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00135   }
00136 }
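// Illustration (not part of the NAMD source): the shuffleNext() helpers above implement a
// warp-wide ring rotation in which each lane hands its j-atom data to the lane below it, so
// after WARPSIZE rotations every i-atom lane has visited every j-atom of the tile exactly
// once. A standalone sketch, assuming WARP_SHUFFLE maps to __shfl_sync, WARP_FULL_MASK is
// 0xffffffff and WARPSIZE is 32:
#if 0
__global__ void ringRotationDemo(int* out) {
  int lane = threadIdx.x & 31;
  int val = lane;                  // stand-in for the per-lane j-atom payload
  for (int t = 0; t < 32; t++) {
    out[t*32 + lane] = val;        // at iteration t, lane holds value (lane + t) & 31
    val = __shfl_sync(0xffffffff, val, (lane + 1) & 31, 32);
  }
}
#endif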
00137 
00138 //#define USE_NEW_EXCL_METHOD
00139 
00140 //
00141 // Returns a lower bound on the squared distance between a bounding box and an atom position
00142 //
00143 __device__ __forceinline__ float distsq(const BoundingBox a, const float4 b) {
00144   float dx = max(0.0f, fabsf(a.x - b.x) - a.wx);
00145   float dy = max(0.0f, fabsf(a.y - b.y) - a.wy);
00146   float dz = max(0.0f, fabsf(a.z - b.z) - a.wz);
00147   float r2 = dx*dx + dy*dy + dz*dz;
00148   return r2;
00149 }
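// Illustration (not part of the NAMD source): distsq() treats (x,y,z) as the box center and
// (wx,wy,wz) as half-widths, clamping each per-axis distance at the box surface, so the
// result is a lower bound on the squared box-to-atom distance (zero when the atom lies
// inside the box). The same computation restated without the BoundingBox type:
#if 0
static inline float boxPointDistSq(float cx, float cy, float cz,
                                   float wx, float wy, float wz,
                                   float px, float py, float pz) {
  float dx = fmaxf(0.0f, fabsf(cx - px) - wx);
  float dy = fmaxf(0.0f, fabsf(cy - py) - wy);
  float dz = fmaxf(0.0f, fabsf(cz - pz) - wz);
  return dx*dx + dy*dy + dz*dz;
}
#endif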
00150 
00151 #define LARGE_FLOAT (float)(1.0e10)
00152 
00153 //
00154 // Nonbonded force kernel
00155 //
00156 template <bool doEnergy, bool doVirial, bool doSlow, bool doPairlist, bool doStreaming>
00157 __global__ void
00158 __launch_bounds__(WARPSIZE*NONBONDKERNEL_NUM_WARP,
00159   doPairlist ? (10) : (doEnergy ? (10) : (10) )
00160   )
00161 nonbondedForceKernel(const int start, const int numTileLists,
00162   const TileList* __restrict__ tileLists, TileExcl* __restrict__ tileExcls,
00163   const int* __restrict__ tileJatomStart,
00164   const int vdwCoefTableWidth, const float2* __restrict__ vdwCoefTable, const int* __restrict__ vdwTypes,
00165   const float3 lata, const float3 latb, const float3 latc,
00166   const float4* __restrict__ xyzq, const float cutoff2,
00167   cudaTextureObject_t vdwCoefTableTex,
00168   cudaTextureObject_t forceTableTex, cudaTextureObject_t energyTableTex,
00169   // ----------
00170   // doPairlist
00171   const int atomStorageSize, const float plcutoff2, const PatchPairRecord* __restrict__ patchPairs,
00172   const int* __restrict__ atomIndex,
00173   const int2* __restrict__ exclIndexMaxDiff, const unsigned int* __restrict__ overflowExclusions,
00174   unsigned int* __restrict__ tileListDepth, int* __restrict__ tileListOrder,
00175   int* __restrict__ jtiles, TileListStat* __restrict__ tileListStat,
00176   const BoundingBox* __restrict__ boundingBoxes,
00177 #ifdef USE_NEW_EXCL_METHOD
00178   const int* __restrict__ minmaxExclAtom,
00179 #endif
00180   // ----------
00181   float4* __restrict__ devForces, float4* __restrict__ devForcesSlow,
00182   // ---- USE_STREAMING_FORCES ----
00183   const int numPatches,
00184   unsigned int* __restrict__ patchNumCount,
00185   const CudaPatchRecord* __restrict__ cudaPatches,
00186   float4* __restrict__ mapForces, float4* __restrict__ mapForcesSlow,
00187   int* __restrict__ mapPatchReadyQueue,
00188   int* __restrict__ outputOrder,
00189   // ------------------------------
00190   TileListVirialEnergy* __restrict__ virialEnergy) {
00191 
00192   // Single warp takes care of one list of tiles
00193   // for (int itileList = (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;itileList < numTileLists;itileList += blockDim.x*gridDim.x/WARPSIZE)
00194   int itileList = start + (threadIdx.x + blockDim.x*blockIdx.x)/WARPSIZE;
00195   if (itileList < numTileLists)
00196   {
00197 
00198     float3 iforce;
00199     float3 iforceSlow;
00200     float energyVdw, energyElec, energySlow;
00201     int nexcluded;
00202     unsigned int itileListLen;
00203     int2 patchInd;
00204     int2 patchNumList;
00205 
00206     // Start computation
00207     {
00208       // Thread's lane index within the warp (0...WARPSIZE-1)
00209       const int wid = threadIdx.x % WARPSIZE;
00210 
00211       TileList tmp = tileLists[itileList];
00212       int iatomStart = tmp.iatomStart;
00213       int jtileStart = tmp.jtileStart;
00214       int jtileEnd   = tmp.jtileEnd;
00215       patchInd     = tmp.patchInd;
00216       patchNumList = tmp.patchNumList;
00217 
00218       float shx = tmp.offsetXYZ.x*lata.x + tmp.offsetXYZ.y*latb.x + tmp.offsetXYZ.z*latc.x;
00219       float shy = tmp.offsetXYZ.x*lata.y + tmp.offsetXYZ.y*latb.y + tmp.offsetXYZ.z*latc.y;
00220       float shz = tmp.offsetXYZ.x*lata.z + tmp.offsetXYZ.y*latb.z + tmp.offsetXYZ.z*latc.z;
00221 
00222       // DH - set zeroShift flag if magnitude of shift vector is zero
00223       bool zeroShift = ! (shx*shx + shy*shy + shz*shz > 0);
00224 
00225       int iatomSize, iatomFreeSize, jatomSize, jatomFreeSize;
00226       if (doPairlist) {
00227         PatchPairRecord PPStmp = patchPairs[itileList];
00228         iatomSize     = PPStmp.iatomSize;
00229         iatomFreeSize = PPStmp.iatomFreeSize;
00230         jatomSize     = PPStmp.jatomSize;
00231         jatomFreeSize = PPStmp.jatomFreeSize;
00232       }
00233 
00234       // Write to global memory here to avoid register spilling
00235       if (doVirial) {
00236         if (wid == 0) {
00237           virialEnergy[itileList].shx = shx;
00238           virialEnergy[itileList].shy = shy;
00239           virialEnergy[itileList].shz = shz;
00240         }
00241       }
00242 
00243       // Load i-atom data (and shift coordinates)
00244       float4 xyzq_i = xyzq[iatomStart + wid];
00245       xyzq_i.x += shx;
00246       xyzq_i.y += shy;
00247       xyzq_i.z += shz;
00248       int vdwtypei = vdwTypes[iatomStart + wid]*vdwCoefTableWidth;
00249 
00250       // Load i-atom bounding box (and shift coordinates)
00251       BoundingBox boundingBoxI;
00252       if (doPairlist) {
00253         boundingBoxI = boundingBoxes[iatomStart/WARPSIZE];
00254         boundingBoxI.x += shx;
00255         boundingBoxI.y += shy;
00256         boundingBoxI.z += shz;
00257       }
00258 
00259       // Get i-atom global index
00260 #ifdef USE_NEW_EXCL_METHOD
00261       int iatomIndex, minExclAtom, maxExclAtom;
00262 #else
00263       int iatomIndex;
00264 #endif
00265       if (doPairlist) {
00266 #ifdef USE_NEW_EXCL_METHOD
00267         iatomIndex = atomIndex[iatomStart + wid];
00268         int2 tmp = minmaxExclAtom[iatomStart + wid];
00269         minExclAtom = tmp.x;
00270         maxExclAtom = tmp.y;
00271 #else
00272         iatomIndex = atomIndex[iatomStart + wid];
00273 #endif
00274       }
00275 
00276       // i-forces in registers
00277       // float3 iforce;
00278       iforce.x = 0.0f;
00279       iforce.y = 0.0f;
00280       iforce.z = 0.0f;
00281 
00282       // float3 iforceSlow;
00283       if (doSlow) {
00284         iforceSlow.x = 0.0f;
00285         iforceSlow.y = 0.0f;
00286         iforceSlow.z = 0.0f;
00287       }
00288 
00289       // float energyVdw, energyElec, energySlow;
00290       if (doEnergy) {
00291         energyVdw = 0.0f;
00292         energyElec = 0.0f;
00293         if (doSlow) energySlow = 0.0f;
00294       }
00295 
00296       // Number of exclusions
00297       // NOTE: The lowest bit is used as an indicator bit for tile pairs:
00298       //       0 = tile has no atom pairs within the pairlist cutoff
00299       //       1 = tile has atom pairs within the pairlist cutoff
00300       // int nexcluded;
00301       if (doPairlist) nexcluded = 0;
00302 
00303       // Number of i loops and free atoms
00304       int nfreei;
00305       if (doPairlist) {
00306         int nloopi = min(iatomSize - iatomStart, WARPSIZE);
00307         nfreei = max(iatomFreeSize - iatomStart, 0);
00308         if (wid >= nloopi) {
00309           xyzq_i.x = -LARGE_FLOAT;
00310           xyzq_i.y = -LARGE_FLOAT;
00311           xyzq_i.z = -LARGE_FLOAT;
00312         }
00313       }
00314 
00315       // tile list stuff
00316       // int itileListLen;
00317       // int minJatomStart;
00318       if (doPairlist) {
00319         // minJatomStart = tileJatomStart[jtileStart];
00320         itileListLen = 0;
00321       }
00322 
00323       // Exclusion index and maxdiff
00324       int iexclIndex, iexclMaxdiff;
00325       if (doPairlist) {
00326         int2 tmp = exclIndexMaxDiff[iatomStart + wid];
00327         iexclIndex   = tmp.x;
00328         iexclMaxdiff = tmp.y;
00329       }
00330 
00331       for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) {
00332 
00333         // Load j-atom starting index and exclusion mask
00334         int jatomStart = tileJatomStart[jtile];
00335 
00336         float4 xyzq_j = xyzq[jatomStart + wid];
00337 
00338         // Check for early bail
00339         if (doPairlist) {
00340           float r2bb = distsq(boundingBoxI, xyzq_j);
00341           if (WARP_ALL(WARP_FULL_MASK, r2bb > plcutoff2)) continue;
00342         }
00343         unsigned int excl = (doPairlist) ? 0 : tileExcls[jtile].excl[wid];
00344         int vdwtypej = vdwTypes[jatomStart + wid];
00345 
00346         // Get j-atom global index
00347         int jatomIndex;
00348         if (doPairlist) {
00349           jatomIndex = atomIndex[jatomStart + wid];
00350         }
00351 
00352         // Number of j loops and free atoms
00353         int nfreej;
00354         if (doPairlist) {
00355           int nloopj = min(jatomSize - jatomStart, WARPSIZE);
00356           nfreej = max(jatomFreeSize - jatomStart, 0);
00357           //if (nfreei == 0 && nfreej == 0) continue;
00358           if (wid >= nloopj) {
00359             xyzq_j.x = LARGE_FLOAT;
00360             xyzq_j.y = LARGE_FLOAT;
00361             xyzq_j.z = LARGE_FLOAT;
00362           }
00363         }
00364 
00365         // DH - self requires that zeroShift is also set
00366         const bool self = zeroShift && (iatomStart == jatomStart);
00367         const int modval = (self) ? 2*WARPSIZE-1 : WARPSIZE-1;
00368 
00369         float3 jforce;
00370         jforce.x = 0.0f;
00371         jforce.y = 0.0f;
00372         jforce.z = 0.0f;
00373         
00374         float3 jforceSlow;
00375         if (doSlow) {
00376           jforceSlow.x = 0.0f;
00377           jforceSlow.y = 0.0f;
00378           jforceSlow.z = 0.0f;
00379         }
00380 
00381         int t = (self) ? 1 : 0;
00382 
00383         if (doPairlist) {
00384           // Build pair list
00385           // NOTE: During pairlist update we must also include the diagonal term, since it
00386           //       is used in GBIS phase 2.
00387           // Clear the lowest (indicator) bit
00388           nexcluded &= (~1);
00389 
00390           // For self tiles, do the diagonal term (t=0).
00391           // NOTE: No energies are computed here, since this self-diagonal term is only for GBIS phase 2
00392           if (self) {
00393             int j = (0 + wid) & modval;
00394             // NOTE: The __shfl() operation can return nonsense here because j may be >= WARPSIZE.
00395             //       However, the "if (j < WARPSIZE ...)" test below makes sure that these
00396             //       nonsense results are never actually used.
00397             float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00398             float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00399             float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00400 
00401             float r2 = dx*dx + dy*dy + dz*dz;
00402 
00403             if (j < WARPSIZE && r2 < plcutoff2) {
00404               // We have atom pair within the pairlist cutoff => Set indicator bit
00405               nexcluded |= 1;
00406             }
00407             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00408           }
00409 
00410           for (;t < WARPSIZE;t++) {
00411             int j = (t + wid) & modval;
00412 
00413             // NOTE: The __shfl() operation can return nonsense here because j may be >= WARPSIZE.
00414             //       However, the "if (j < WARPSIZE ...)" test below makes sure that these
00415             //       nonsense results are not used.
00416             float dx = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, j, WARPSIZE) - xyzq_i.x;
00417             float dy = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, j, WARPSIZE) - xyzq_i.y;
00418             float dz = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, j, WARPSIZE) - xyzq_i.z;
00419 
00420             float r2 = dx*dx + dy*dy + dz*dz;
00421 
00422             excl >>= 1;
00423             if (j < WARPSIZE && r2 < plcutoff2) {
00424               // We have atom pair within the pairlist cutoff => Set indicator bit
00425               nexcluded |= 1;
00426               if (j < nfreej || wid < nfreei) {
00427                 bool excluded = false;
00428                 int indexdiff = jatomIndex - iatomIndex;
00429                 if ( abs(indexdiff) <= iexclMaxdiff) {
00430                   indexdiff += iexclIndex;
00431                   int indexword = ((unsigned int) indexdiff) >> 5;
00432 
00433                   if ( indexword < MAX_CONST_EXCLUSIONS ) {
00434                     indexword = constExclusions[indexword];
00435                   } else {
00436                     indexword = overflowExclusions[indexword];
00437                   }
00438 
00439                   excluded = ((indexword & (1<<(indexdiff&31))) != 0);
00440                 }
00441                 if (excluded) nexcluded += 2;
00442                 if (!excluded) excl |= 0x80000000;
00443                 if (!excluded && r2 < cutoff2) {
00444                   calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
00445                     vdwtypei, vdwtypej,
00446                     vdwCoefTable,
00447                     vdwCoefTableTex, forceTableTex, energyTableTex,
00448                     iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
00449                 }
00450               }
00451             }
00452             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00453             shuffleNext<doSlow>(jforce, jforceSlow);
00454           } // t
00455         } else {
00456           // Just compute forces
00457           if (self) {
00458             excl >>= 1;
00459             xyzq_j.x = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00460             xyzq_j.y = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00461             xyzq_j.z = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00462             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00463           }
00464           for (;t < WARPSIZE;t++) {
00465             if ((excl & 1)) {
00466               float dx = xyzq_j.x - xyzq_i.x;
00467               float dy = xyzq_j.y - xyzq_i.y;
00468               float dz = xyzq_j.z - xyzq_i.z;
00469 
00470               float r2 = dx*dx + dy*dy + dz*dz;
00471 
00472               if (r2 < cutoff2) {
00473                 calcForceEnergy<doEnergy, doSlow>(r2, xyzq_i.w, xyzq_j.w, dx, dy, dz,
00474                   vdwtypei, vdwtypej,
00475                   vdwCoefTable,
00476                   vdwCoefTableTex, forceTableTex, energyTableTex,
00477                   iforce, iforceSlow, jforce, jforceSlow, energyVdw, energyElec, energySlow);
00478               } // (r2 < cutoff2)
00479             } // (excl & 1)
00480             excl >>= 1;
00481             xyzq_j.x = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.x, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00482             xyzq_j.y = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.y, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00483             xyzq_j.z = WARP_SHUFFLE(WARP_FULL_MASK, xyzq_j.z, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE);
00484             shuffleNext<doPairlist>(xyzq_j.w, vdwtypej, jatomIndex);
00485             shuffleNext<doSlow>(jforce, jforceSlow);
00486           } // t
00487         }
00488 
00489         // Write j-forces
00490         storeForces<doSlow>(jatomStart + wid, jforce, jforceSlow, devForces, devForcesSlow);
00491 
00492         // Write exclusions
00493         if (doPairlist && WARP_ANY(WARP_FULL_MASK, nexcluded & 1)) {
00494           int anyexcl = (65536 | WARP_ANY(WARP_FULL_MASK, excl));
00495           // Mark this jtile as non-empty:
00496           //  VdW:      1 if the tile has atom pairs within the pairlist cutoff and some of these atoms interact
00497           //  GBIS: 65536 if the tile has atom pairs within the pairlist cutoff but not necessarily interacting (i.e. these atoms are fixed or excluded)
00498           if (wid == 0) jtiles[jtile] = anyexcl;
00499           // Store exclusions
00500           tileExcls[jtile].excl[wid] = excl;
00501           // itileListLen:
00502           // lower 16 bits: number of tiles with atom pairs within the pairlist cutoff that interact
00503           // upper 16 bits: number of tiles with atom pairs within the pairlist cutoff (but not necessarily interacting)
00504           itileListLen += anyexcl;
00505           // NOTE, this minJatomStart is only stored once for the first tile list entry
00506           // minJatomStart = min(minJatomStart, jatomStart);
00507         }
00508 
00509       } // jtile
00510 
00511       // Write i-forces
00512       storeForces<doSlow>(iatomStart + wid, iforce, iforceSlow, devForces, devForcesSlow);
00513     }
00514     // Done with computation
00515 
00516     // Save pairlist stuff
00517     if (doPairlist) {
00518 
00519       // Thread's lane index within the warp (0...WARPSIZE-1)
00520       const int wid = threadIdx.x % WARPSIZE;
00521 
00522       if (wid == 0) {
00523         // minJatomStart is in range [0 ... atomStorageSize-1]
00524         //int atom0 = (minJatomStart)/WARPSIZE;
00525         // int atom0 = 0;
00526         // int storageOffset = atomStorageSize/WARPSIZE;
00527         // int itileListLen = 0;
00528         // for (int jtile=jtileStart;jtile <= jtileEnd;jtile++) itileListLen += jtiles[jtile];
00529         // Store 0 if itileListLen == 0
00530         // tileListDepth[itileList] = (itileListLen > 0)*(itileListLen*storageOffset + atom0);
00531         tileListDepth[itileList] = itileListLen;
00532         tileListOrder[itileList] = itileList;
00533         // Number of active tile lists with tiles that have atom pairs within the pairlist cutoff and interact
00534         if ((itileListLen & 65535) > 0) atomicAdd(&tileListStat->numTileLists, 1);
00535         // Number of active tile lists with tiles that have atom pairs within the pairlist cutoff (but not necessarily interacting)
00536         if (itileListLen > 0) atomicAdd(&tileListStat->numTileListsGBIS, 1);
00537         // NOTE: numTileListsGBIS >= numTileLists always holds
00538       }
00539 
00540       typedef cub::WarpReduce<int> WarpReduceInt;
00541       __shared__ typename WarpReduceInt::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
00542       int warpId = threadIdx.x / WARPSIZE;
00543       // Remove indicator bit
00544       nexcluded >>= 1;
00545       volatile int nexcludedWarp = WarpReduceInt(tempStorage[warpId]).Sum(nexcluded);
00546       if (wid == 0) atomicAdd(&tileListStat->numExcluded, nexcludedWarp);
00547 
00548     }
00549 
00550     if (doVirial) {
00551       // Thread's lane index within the warp (0...WARPSIZE-1)
00552       const int wid = threadIdx.x % WARPSIZE;
00553 
00554       typedef cub::WarpReduce<float> WarpReduce;
00555       __shared__ typename WarpReduce::TempStorage tempStorage[NONBONDKERNEL_NUM_WARP];
00556       int warpId = threadIdx.x / WARPSIZE;
00557       volatile float iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforce.x);
00558       volatile float iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforce.y);
00559       volatile float iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforce.z);
00560       if (wid == 0) {
00561         virialEnergy[itileList].forcex = iforcexSum;
00562         virialEnergy[itileList].forcey = iforceySum;
00563         virialEnergy[itileList].forcez = iforcezSum;
00564       }
00565 
00566       if (doSlow) {
00567         iforcexSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.x);
00568         iforceySum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.y);
00569         iforcezSum = WarpReduce(tempStorage[warpId]).Sum(iforceSlow.z);
00570         if (wid == 0) {
00571           virialEnergy[itileList].forceSlowx = iforcexSum;
00572           virialEnergy[itileList].forceSlowy = iforceySum;
00573           virialEnergy[itileList].forceSlowz = iforcezSum;
00574         }
00575       }
00576     }
00577 
00578     // Reduce energy
00579     if (doEnergy) {
00580       // NOTE: These warp-wide reductions are written by hand to avoid the excess register
00581       //       spilling that cub::WarpReduce causes in this context.
00582 #pragma unroll
00583       for (int i=16;i >= 1;i/=2) {
00584         energyVdw += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyVdw, i, 32);
00585         energyElec += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energyElec, i, 32);
00586         if (doSlow) energySlow += WARP_SHUFFLE_XOR(WARP_FULL_MASK, energySlow, i, 32);
00587       }
00588 
00589       if (threadIdx.x % WARPSIZE == 0) {
00590         virialEnergy[itileList].energyVdw  = energyVdw;
00591         virialEnergy[itileList].energyElec = energyElec;
00592         if (doSlow) virialEnergy[itileList].energySlow = energySlow;
00593       }
00594     }
00595 
00596     if (doStreaming) {
00597       // Make sure devForces and devForcesSlow have been written into device memory.
00598       // No thread synchronization is needed since this is all warp-level work.
00599       __threadfence();
00600 
00601       int patchDone[2] = {false, false};
00602       const int wid = threadIdx.x % WARPSIZE;
00603       if (wid == 0) {
00604         int patchCountOld0 = atomicInc(&patchNumCount[patchInd.x], (unsigned int)(patchNumList.x-1));
00605         patchDone[0] = (patchCountOld0 + 1 == patchNumList.x);
00606         if (patchInd.x != patchInd.y) {
00607           int patchCountOld1 = atomicInc(&patchNumCount[patchInd.y], (unsigned int)(patchNumList.y-1));
00608           patchDone[1] = (patchCountOld1 + 1 == patchNumList.y);
00609         }
00610       }
00611 
00612       patchDone[0] = WARP_ANY(WARP_FULL_MASK, patchDone[0]);
00613       patchDone[1] = WARP_ANY(WARP_FULL_MASK, patchDone[1]);
00614 
00615       if (patchDone[0]) {
00616         // Patch 1 is done, write its forces to host-mapped memory
00617         CudaPatchRecord patch = cudaPatches[patchInd.x];
00618         int start = patch.atomStart;
00619         int end   = start + patch.numAtoms;
00620         for (int i=start+wid;i < end;i+=WARPSIZE) {
00621           mapForces[i] = devForces[i];
00622           if (doSlow) mapForcesSlow[i] = devForcesSlow[i];
00623         }
00624       }
00625       if (patchDone[1]) {
00626         // Patch 2 is done
00627         CudaPatchRecord patch = cudaPatches[patchInd.y];
00628         int start = patch.atomStart;
00629         int end   = start + patch.numAtoms;
00630         for (int i=start+wid;i < end;i+=WARPSIZE) {
00631           mapForces[i] = devForces[i];
00632           if (doSlow) mapForcesSlow[i] = devForcesSlow[i];
00633         }
00634       }
00635 
00636       if (patchDone[0] || patchDone[1]) {
00637         // Make sure mapForces and mapForcesSlow are up-to-date
00638         __threadfence_system();
00639         // Add patch into "patchReadyQueue"
00640         if (wid == 0) {
00641           if (patchDone[0]) {
00642             int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
00643             // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
00644             mapPatchReadyQueue[ind] = patchInd.x;
00645           }
00646           if (patchDone[1]) {
00647             int ind = atomicAdd(&tileListStat->patchReadyQueueCount, 1);
00648             // int ind = atomicInc((unsigned int *)&mapPatchReadyQueue[numPatches], numPatches-1);
00649             mapPatchReadyQueue[ind] = patchInd.y;
00650           }
00651         }
00652         // Make sure "patchReadyQueue" is visible in page-locked host memory
00653         __threadfence_system();
00654       }
00655     }
00656 
00657     if (doStreaming && outputOrder != NULL && threadIdx.x % WARPSIZE == 0) {
00658       int index = atomicAdd(&tileListStat->outputOrderIndex, 1);
00659       outputOrder[index] = itileList;
00660     }
00661   } // if (itileList < numTileLists)
00662 }
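// Illustration (not part of the NAMD source): the exclusion test in the pairlist branch above
// reads one bit from a flat bitmap. The atom-index difference (jatomIndex - iatomIndex) is
// offset by the i-atom's exclIndex, split into a 32-bit word index (>> 5) and a bit position
// (& 31), and the word is read from constant memory when it falls within the first
// MAX_CONST_EXCLUSIONS words, otherwise from the overflowExclusions copy in global memory.
// A host-side sketch of the same lookup, assuming "bits" points at the full bitmap:
#if 0
static inline bool isExcludedHost(const unsigned int* bits,
                                  int iatomIndex, int jatomIndex,
                                  int iexclIndex, int iexclMaxdiff) {
  int indexdiff = jatomIndex - iatomIndex;
  if (abs(indexdiff) > iexclMaxdiff) return false;   // outside this atom's exclusion window
  indexdiff += iexclIndex;                           // offset into the packed bitmap
  unsigned int word = bits[((unsigned int)indexdiff) >> 5];
  return (word & (1u << (indexdiff & 31))) != 0;
}
#endif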
00663 
00664 //
00665 // Finish up - reduce virials from nonbonded kernel
00666 //
00667 #define REDUCENONBONDEDVIRIALKERNEL_NUM_WARP 32
00668 __global__ void reduceNonbondedVirialKernel(const bool doSlow,
00669   const int atomStorageSize,
00670   const float4* __restrict__ xyzq,
00671   const float4* __restrict__ devForces, const float4* __restrict__ devForcesSlow,
00672   VirialEnergy* __restrict__ virialEnergy) {
00673 
00674   for (int ibase = blockIdx.x*blockDim.x;ibase < atomStorageSize;ibase += blockDim.x*gridDim.x)
00675   {
00676     int i = ibase + threadIdx.x;
00677 
00678     // Set to zero to avoid nan*0
00679     float4 pos;
00680     pos.x = 0.0f;
00681     pos.y = 0.0f;
00682     pos.z = 0.0f;
00683     float4 force, forceSlow;
00684     force.x = 0.0f;
00685     force.y = 0.0f;
00686     force.z = 0.0f;
00687     forceSlow.x = 0.0f;
00688     forceSlow.y = 0.0f;
00689     forceSlow.z = 0.0f;
00690     if (i < atomStorageSize) {
00691       pos = xyzq[i];
00692       force = devForces[i];
00693       if (doSlow) forceSlow = devForcesSlow[i];
00694     }
00695     // Reduce across the entire thread block
00696     float vxxt = force.x*pos.x;
00697     float vxyt = force.x*pos.y;
00698     float vxzt = force.x*pos.z;
00699     float vyxt = force.y*pos.x;
00700     float vyyt = force.y*pos.y;
00701     float vyzt = force.y*pos.z;
00702     float vzxt = force.z*pos.x;
00703     float vzyt = force.z*pos.y;
00704     float vzzt = force.z*pos.z;
00705     // atomicAdd(&virialEnergy->virial[0], (double)vxx);
00706     // atomicAdd(&virialEnergy->virial[1], (double)vxy);
00707     // atomicAdd(&virialEnergy->virial[2], (double)vxz);
00708     // atomicAdd(&virialEnergy->virial[3], (double)vyx);
00709     // atomicAdd(&virialEnergy->virial[4], (double)vyy);
00710     // atomicAdd(&virialEnergy->virial[5], (double)vyz);
00711     // atomicAdd(&virialEnergy->virial[6], (double)vzx);
00712     // atomicAdd(&virialEnergy->virial[7], (double)vzy);
00713     // atomicAdd(&virialEnergy->virial[8], (double)vzz);
00714 
00715     typedef cub::BlockReduce<float, REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00716     __shared__ typename BlockReduce::TempStorage tempStorage;
00717     volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00718     volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00719     volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00720     volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00721     volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00722     volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00723     volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00724     volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00725     volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00726     if (threadIdx.x == 0) {
00727       atomicAdd(&virialEnergy->virial[0], (double)vxx);
00728       atomicAdd(&virialEnergy->virial[1], (double)vxy);
00729       atomicAdd(&virialEnergy->virial[2], (double)vxz);
00730       atomicAdd(&virialEnergy->virial[3], (double)vyx);
00731       atomicAdd(&virialEnergy->virial[4], (double)vyy);
00732       atomicAdd(&virialEnergy->virial[5], (double)vyz);
00733       atomicAdd(&virialEnergy->virial[6], (double)vzx);
00734       atomicAdd(&virialEnergy->virial[7], (double)vzy);
00735       atomicAdd(&virialEnergy->virial[8], (double)vzz);
00736     }
00737 
00738     if (doSlow) {
00739       // if (isnan(forceSlow.x) || isnan(forceSlow.y) || isnan(forceSlow.z))
00740       float vxxSlowt = forceSlow.x*pos.x;
00741       float vxySlowt = forceSlow.x*pos.y;
00742       float vxzSlowt = forceSlow.x*pos.z;
00743       float vyxSlowt = forceSlow.y*pos.x;
00744       float vyySlowt = forceSlow.y*pos.y;
00745       float vyzSlowt = forceSlow.y*pos.z;
00746       float vzxSlowt = forceSlow.z*pos.x;
00747       float vzySlowt = forceSlow.z*pos.y;
00748       float vzzSlowt = forceSlow.z*pos.z;
00749       // atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
00750       // atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
00751       // atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
00752       // atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
00753       // atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
00754       // atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
00755       // atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
00756       // atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
00757       // atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
00758       volatile float vxxSlow = BlockReduce(tempStorage).Sum(vxxSlowt); BLOCK_SYNC;
00759       volatile float vxySlow = BlockReduce(tempStorage).Sum(vxySlowt); BLOCK_SYNC;
00760       volatile float vxzSlow = BlockReduce(tempStorage).Sum(vxzSlowt); BLOCK_SYNC;
00761       volatile float vyxSlow = BlockReduce(tempStorage).Sum(vyxSlowt); BLOCK_SYNC;
00762       volatile float vyySlow = BlockReduce(tempStorage).Sum(vyySlowt); BLOCK_SYNC;
00763       volatile float vyzSlow = BlockReduce(tempStorage).Sum(vyzSlowt); BLOCK_SYNC;
00764       volatile float vzxSlow = BlockReduce(tempStorage).Sum(vzxSlowt); BLOCK_SYNC;
00765       volatile float vzySlow = BlockReduce(tempStorage).Sum(vzySlowt); BLOCK_SYNC;
00766       volatile float vzzSlow = BlockReduce(tempStorage).Sum(vzzSlowt); BLOCK_SYNC;
00767       if (threadIdx.x == 0) {
00768         atomicAdd(&virialEnergy->virialSlow[0], (double)vxxSlow);
00769         atomicAdd(&virialEnergy->virialSlow[1], (double)vxySlow);
00770         atomicAdd(&virialEnergy->virialSlow[2], (double)vxzSlow);
00771         atomicAdd(&virialEnergy->virialSlow[3], (double)vyxSlow);
00772         atomicAdd(&virialEnergy->virialSlow[4], (double)vyySlow);
00773         atomicAdd(&virialEnergy->virialSlow[5], (double)vyzSlow);
00774         atomicAdd(&virialEnergy->virialSlow[6], (double)vzxSlow);
00775         atomicAdd(&virialEnergy->virialSlow[7], (double)vzySlow);
00776         atomicAdd(&virialEnergy->virialSlow[8], (double)vzzSlow);
00777       }
00778     }
00779   
00780   }
00781 
00782 }
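// Illustration (not part of the NAMD source): the reduction above accumulates the 3x3 virial
// tensor as the sum over atoms of the outer product of force and position, with a separate
// tensor for the slow forces. A host-side reference of the same accumulation (index layout
// 0..8 = xx,xy,xz,yx,yy,yz,zx,zy,zz, matching the atomicAdd order above):
#if 0
static void accumulateVirialHost(int n, const float4* pos, const float4* force, double virial[9]) {
  for (int i = 0; i < n; i++) {
    virial[0] += (double)force[i].x * pos[i].x;
    virial[1] += (double)force[i].x * pos[i].y;
    virial[2] += (double)force[i].x * pos[i].z;
    virial[3] += (double)force[i].y * pos[i].x;
    virial[4] += (double)force[i].y * pos[i].y;
    virial[5] += (double)force[i].y * pos[i].z;
    virial[6] += (double)force[i].z * pos[i].x;
    virial[7] += (double)force[i].z * pos[i].y;
    virial[8] += (double)force[i].z * pos[i].z;
  }
}
#endif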
00783 
00784 #define REDUCEVIRIALENERGYKERNEL_NUM_WARP 32
00785 __global__ void reduceVirialEnergyKernel(
00786   const bool doEnergy, const bool doVirial, const bool doSlow,
00787   const int numTileLists,
00788   const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
00789   VirialEnergy* __restrict__ virialEnergy) {
00790 
00791   for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
00792   {
00793     int itileList = ibase + threadIdx.x;
00794     TileListVirialEnergy ve;
00795     if (itileList < numTileLists) {
00796       ve = tileListVirialEnergy[itileList];
00797     } else {
00798       // Set to zero to avoid nan*0
00799       if (doVirial) {
00800         ve.shx = 0.0f;
00801         ve.shy = 0.0f;
00802         ve.shz = 0.0f;
00803         ve.forcex = 0.0f;
00804         ve.forcey = 0.0f;
00805         ve.forcez = 0.0f;
00806         ve.forceSlowx = 0.0f;
00807         ve.forceSlowy = 0.0f;
00808         ve.forceSlowz = 0.0f;
00809       }
00810       if (doEnergy) {
00811         ve.energyVdw = 0.0;
00812         ve.energyElec = 0.0;
00813         ve.energySlow = 0.0;
00814         // ve.energyGBIS = 0.0;
00815       }
00816     }
00817 
00818     if (doVirial) {
00819       typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00820       __shared__ typename BlockReduce::TempStorage tempStorage;
00821       float vxxt = ve.forcex*ve.shx;
00822       float vxyt = ve.forcex*ve.shy;
00823       float vxzt = ve.forcex*ve.shz;
00824       float vyxt = ve.forcey*ve.shx;
00825       float vyyt = ve.forcey*ve.shy;
00826       float vyzt = ve.forcey*ve.shz;
00827       float vzxt = ve.forcez*ve.shx;
00828       float vzyt = ve.forcez*ve.shy;
00829       float vzzt = ve.forcez*ve.shz;
00830       volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00831       volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00832       volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00833       volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00834       volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00835       volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00836       volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00837       volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00838       volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00839       if (threadIdx.x == 0) {
00840         atomicAdd(&virialEnergy->virial[0], (double)vxx);
00841         atomicAdd(&virialEnergy->virial[1], (double)vxy);
00842         atomicAdd(&virialEnergy->virial[2], (double)vxz);
00843         atomicAdd(&virialEnergy->virial[3], (double)vyx);
00844         atomicAdd(&virialEnergy->virial[4], (double)vyy);
00845         atomicAdd(&virialEnergy->virial[5], (double)vyz);
00846         atomicAdd(&virialEnergy->virial[6], (double)vzx);
00847         atomicAdd(&virialEnergy->virial[7], (double)vzy);
00848         atomicAdd(&virialEnergy->virial[8], (double)vzz);
00849       }
00850 
00851       if (doSlow) {
00852         typedef cub::BlockReduce<float, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00853         __shared__ typename BlockReduce::TempStorage tempStorage;
00854         float vxxt = ve.forceSlowx*ve.shx;
00855         float vxyt = ve.forceSlowx*ve.shy;
00856         float vxzt = ve.forceSlowx*ve.shz;
00857         float vyxt = ve.forceSlowy*ve.shx;
00858         float vyyt = ve.forceSlowy*ve.shy;
00859         float vyzt = ve.forceSlowy*ve.shz;
00860         float vzxt = ve.forceSlowz*ve.shx;
00861         float vzyt = ve.forceSlowz*ve.shy;
00862         float vzzt = ve.forceSlowz*ve.shz;
00863         volatile float vxx = BlockReduce(tempStorage).Sum(vxxt); BLOCK_SYNC;
00864         volatile float vxy = BlockReduce(tempStorage).Sum(vxyt); BLOCK_SYNC;
00865         volatile float vxz = BlockReduce(tempStorage).Sum(vxzt); BLOCK_SYNC;
00866         volatile float vyx = BlockReduce(tempStorage).Sum(vyxt); BLOCK_SYNC;
00867         volatile float vyy = BlockReduce(tempStorage).Sum(vyyt); BLOCK_SYNC;
00868         volatile float vyz = BlockReduce(tempStorage).Sum(vyzt); BLOCK_SYNC;
00869         volatile float vzx = BlockReduce(tempStorage).Sum(vzxt); BLOCK_SYNC;
00870         volatile float vzy = BlockReduce(tempStorage).Sum(vzyt); BLOCK_SYNC;
00871         volatile float vzz = BlockReduce(tempStorage).Sum(vzzt); BLOCK_SYNC;
00872         if (threadIdx.x == 0) {
00873           atomicAdd(&virialEnergy->virialSlow[0], (double)vxx);
00874           atomicAdd(&virialEnergy->virialSlow[1], (double)vxy);
00875           atomicAdd(&virialEnergy->virialSlow[2], (double)vxz);
00876           atomicAdd(&virialEnergy->virialSlow[3], (double)vyx);
00877           atomicAdd(&virialEnergy->virialSlow[4], (double)vyy);
00878           atomicAdd(&virialEnergy->virialSlow[5], (double)vyz);
00879           atomicAdd(&virialEnergy->virialSlow[6], (double)vzx);
00880           atomicAdd(&virialEnergy->virialSlow[7], (double)vzy);
00881           atomicAdd(&virialEnergy->virialSlow[8], (double)vzz);
00882         }
00883       }
00884     }
00885 
00886     if (doEnergy) {
00887       typedef cub::BlockReduce<double, REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00888       __shared__ typename BlockReduce::TempStorage tempStorage;
00889       volatile double energyVdw  = BlockReduce(tempStorage).Sum(ve.energyVdw); BLOCK_SYNC;
00890       volatile double energyElec = BlockReduce(tempStorage).Sum(ve.energyElec); BLOCK_SYNC;
00891       if (threadIdx.x == 0) {
00892           atomicAdd(&virialEnergy->energyVdw, (double)energyVdw);
00893           atomicAdd(&virialEnergy->energyElec, (double)energyElec);
00894       }
00895       if (doSlow) {
00896         volatile double energySlow = BlockReduce(tempStorage).Sum(ve.energySlow); BLOCK_SYNC;
00897         if (threadIdx.x == 0) atomicAdd(&virialEnergy->energySlow, (double)energySlow);
00898       }
00899       // if (doGBIS) {
00900       //   double energyGBIS = BlockReduce(tempStorage).Sum(ve.energyGBIS); BLOCK_SYNC;
00901       //   if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
00902       // }
00903     }
00904 
00905   }
00906 
00907 }
00908 
00909 #define REDUCEGBISENERGYKERNEL_NUM_WARP 32
00910 __global__ void reduceGBISEnergyKernel(const int numTileLists,
00911   const TileListVirialEnergy* __restrict__ tileListVirialEnergy,
00912   VirialEnergy* __restrict__ virialEnergy) {
00913 
00914   for (int ibase = blockIdx.x*blockDim.x;ibase < numTileLists;ibase += blockDim.x*gridDim.x)
00915   {
00916     int itileList = ibase + threadIdx.x;
00917     double energyGBISt = 0.0;
00918     if (itileList < numTileLists) {
00919       energyGBISt = tileListVirialEnergy[itileList].energyGBIS;
00920     }
00921 
00922     typedef cub::BlockReduce<double, REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE> BlockReduce;
00923     __shared__ typename BlockReduce::TempStorage tempStorage;
00924     volatile double energyGBIS = BlockReduce(tempStorage).Sum(energyGBISt); BLOCK_SYNC;
00925     if (threadIdx.x == 0) atomicAdd(&virialEnergy->energyGBIS, (double)energyGBIS);
00926   }
00927 
00928 }
00929 
00930 // ##############################################################################################
00931 // ##############################################################################################
00932 // ##############################################################################################
00933 
00934 CudaComputeNonbondedKernel::CudaComputeNonbondedKernel(int deviceID, CudaNonbondedTables& cudaNonbondedTables,
00935   bool doStreaming) : deviceID(deviceID), cudaNonbondedTables(cudaNonbondedTables), doStreaming(doStreaming) {
00936   
00937   cudaCheck(cudaSetDevice(deviceID));
00938 
00939   overflowExclusions = NULL;
00940   overflowExclusionsSize = 0;
00941 
00942   exclIndexMaxDiff = NULL;
00943   exclIndexMaxDiffSize = 0;
00944 
00945   atomIndex = NULL;
00946   atomIndexSize = 0;
00947 
00948   vdwTypes = NULL;
00949   vdwTypesSize = 0;
00950 
00951   patchNumCount = NULL;
00952   patchNumCountSize = 0;
00953 
00954   patchReadyQueue = NULL;
00955   patchReadyQueueSize = 0;
00956 
00957 }
00958 
00959 CudaComputeNonbondedKernel::~CudaComputeNonbondedKernel() {
00960   cudaCheck(cudaSetDevice(deviceID));
00961   if (overflowExclusions != NULL) deallocate_device<unsigned int>(&overflowExclusions);
00962   if (exclIndexMaxDiff != NULL) deallocate_device<int2>(&exclIndexMaxDiff);
00963   if (atomIndex != NULL) deallocate_device<int>(&atomIndex);
00964   if (vdwTypes != NULL) deallocate_device<int>(&vdwTypes);
00965   if (patchNumCount != NULL) deallocate_device<unsigned int>(&patchNumCount);
00966   if (patchReadyQueue != NULL) deallocate_host<int>(&patchReadyQueue);
00967 }
00968 
00969 void CudaComputeNonbondedKernel::updateVdwTypesExcl(const int atomStorageSize, const int* h_vdwTypes,
00970   const int2* h_exclIndexMaxDiff, const int* h_atomIndex, cudaStream_t stream) {
00971 
00972   reallocate_device<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, OVERALLOC);
00973   reallocate_device<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, OVERALLOC);
00974   reallocate_device<int>(&atomIndex, &atomIndexSize, atomStorageSize, OVERALLOC);
00975 
00976   copy_HtoD<int>(h_vdwTypes, vdwTypes, atomStorageSize, stream);
00977   copy_HtoD<int2>(h_exclIndexMaxDiff, exclIndexMaxDiff, atomStorageSize, stream);
00978   copy_HtoD<int>(h_atomIndex, atomIndex, atomStorageSize, stream);
00979 }
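// Illustration (not part of the NAMD source): the reallocate_device/reallocate_host helpers
// used above grow a buffer only when the requested element count exceeds the current capacity,
// over-allocating by the OVERALLOC factor (1.2f) so the buffer does not have to be resized on
// every timestep. A generic sketch of that pattern using plain cudaMalloc/cudaFree (the real
// helpers live in NAMD's CUDA utility code and also handle error checking):
#if 0
template <typename T>
static bool reallocateDeviceSketch(T** ptr, int* capacity, int requested, float overalloc = 1.0f) {
  if (requested <= *capacity) return false;        // still fits, keep the old buffer
  if (*ptr != NULL) cudaFree(*ptr);
  *capacity = (int)(requested * overalloc);        // grow with headroom
  cudaMalloc((void**)ptr, (*capacity) * sizeof(T));
  return true;                                     // caller may need to clear or refill
}
#endif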
00980 
00981 int* CudaComputeNonbondedKernel::getPatchReadyQueue() {
00982   if (!doStreaming) {
00983     NAMD_die("CudaComputeNonbondedKernel::getPatchReadyQueue() called on non-streaming kernel");
00984   }
00985   return patchReadyQueue;
00986 }
00987 
00988 void CudaComputeNonbondedKernel::nonbondedForce(CudaTileListKernel& tlKernel,
00989   const int atomStorageSize, const bool doPairlist,
00990   const bool doEnergy, const bool doVirial, const bool doSlow,
00991   const float3 lata, const float3 latb, const float3 latc,
00992   const float4* h_xyzq, const float cutoff2, 
00993   float4* d_forces, float4* d_forcesSlow,
00994   float4* h_forces, float4* h_forcesSlow,
00995   cudaStream_t stream) {
00996 
00997   if (!doPairlist) copy_HtoD<float4>(h_xyzq, tlKernel.get_xyzq(), atomStorageSize, stream);
00998 
00999   clear_device_array<float4>(d_forces, atomStorageSize, stream);
01000   if (doSlow) clear_device_array<float4>(d_forcesSlow, atomStorageSize, stream);
01001 
01002   tlKernel.clearTileListStat(stream);
01003   // clear_device_array<TileListStat>(tlKernel.getTileListStatDevPtr(), 1, stream);
01004 
01005   // --- streaming ----
01006   float4* m_forces = NULL;
01007   float4* m_forcesSlow = NULL;
01008   int* m_patchReadyQueue = NULL;
01009   int numPatches = 0;
01010   unsigned int* patchNumCountPtr = NULL;
01011   if (doStreaming) {
01012     numPatches = tlKernel.getNumPatches();
01013     if (reallocate_device<unsigned int>(&patchNumCount, &patchNumCountSize, numPatches)) {
01014       // If re-allocated, clear array
01015       clear_device_array<unsigned int>(patchNumCount, numPatches, stream);
01016     }
01017     patchNumCountPtr = patchNumCount;
01018     bool re = reallocate_host<int>(&patchReadyQueue, &patchReadyQueueSize, numPatches, cudaHostAllocMapped);
01019     if (re) {
01020       // If re-allocated, re-set to "-1"
01021       for (int i=0;i < numPatches;i++) patchReadyQueue[i] = -1;
01022     }
01023     cudaCheck(cudaHostGetDevicePointer(&m_patchReadyQueue, patchReadyQueue, 0));
01024     cudaCheck(cudaHostGetDevicePointer(&m_forces, h_forces, 0));
01025     cudaCheck(cudaHostGetDevicePointer(&m_forcesSlow, h_forcesSlow, 0));
01026   }
01027   // -----------------
01028 
01029   if (doVirial || doEnergy) {
01030     tlKernel.setTileListVirialEnergyLength(tlKernel.getNumTileLists());
01031   }
01032 
01033   int shMemSize = 0;
01034 
01035   int* outputOrderPtr = tlKernel.getOutputOrder();
01036 
01037   int nwarp = NONBONDKERNEL_NUM_WARP;
01038   int nthread = WARPSIZE*nwarp;
01039   int start = 0;
01040   while (start < tlKernel.getNumTileLists())
01041   {
01042 
01043     int nleft = tlKernel.getNumTileLists() - start;
01044     int nblock = min(deviceCUDA->getMaxNumBlocks(), (nleft-1)/nwarp+1);
01045 
01046 #define CALL(DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING) \
01047     nonbondedForceKernel<DOENERGY, DOVIRIAL, DOSLOW, DOPAIRLIST, DOSTREAMING> \
01048   <<< nblock, nthread, shMemSize, stream >>>  \
01049   (start, tlKernel.getNumTileLists(), tlKernel.getTileLists(), tlKernel.getTileExcls(), tlKernel.getTileJatomStart(), \
01050     cudaNonbondedTables.getVdwCoefTableWidth(), cudaNonbondedTables.getVdwCoefTable(), \
01051     vdwTypes, lata, latb, latc, tlKernel.get_xyzq(), cutoff2, \
01052     cudaNonbondedTables.getVdwCoefTableTex(), cudaNonbondedTables.getForceTableTex(), cudaNonbondedTables.getEnergyTableTex(), \
01053     atomStorageSize, tlKernel.get_plcutoff2(), tlKernel.getPatchPairs(), atomIndex, exclIndexMaxDiff, overflowExclusions, \
01054     tlKernel.getTileListDepth(), tlKernel.getTileListOrder(), tlKernel.getJtiles(), tlKernel.getTileListStatDevPtr(), \
01055     tlKernel.getBoundingBoxes(), d_forces, d_forcesSlow, \
01056     numPatches, patchNumCountPtr, tlKernel.getCudaPatches(), m_forces, m_forcesSlow, m_patchReadyQueue, \
01057     outputOrderPtr, tlKernel.getTileListVirialEnergy()); called=true
01058 
01059     bool called = false;
01060 
01061     if (doStreaming) {
01062       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 1);
01063       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 1);
01064       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 1);
01065       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 1);
01066       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 1);
01067       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 1);
01068       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 1);
01069       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 1);
01070 
01071       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 1);
01072       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 1);
01073       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 1);
01074       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 1);
01075       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 1);
01076       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 1);
01077       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 1);
01078       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 1);
01079     } else {
01080       if (!doEnergy && !doVirial && !doSlow && !doPairlist) CALL(0, 0, 0, 0, 0);
01081       if (!doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(0, 0, 1, 0, 0);
01082       if (!doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(0, 1, 0, 0, 0);
01083       if (!doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(0, 1, 1, 0, 0);
01084       if ( doEnergy && !doVirial && !doSlow && !doPairlist) CALL(1, 0, 0, 0, 0);
01085       if ( doEnergy && !doVirial &&  doSlow && !doPairlist) CALL(1, 0, 1, 0, 0);
01086       if ( doEnergy &&  doVirial && !doSlow && !doPairlist) CALL(1, 1, 0, 0, 0);
01087       if ( doEnergy &&  doVirial &&  doSlow && !doPairlist) CALL(1, 1, 1, 0, 0);
01088 
01089       if (!doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(0, 0, 0, 1, 0);
01090       if (!doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(0, 0, 1, 1, 0);
01091       if (!doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(0, 1, 0, 1, 0);
01092       if (!doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(0, 1, 1, 1, 0);
01093       if ( doEnergy && !doVirial && !doSlow &&  doPairlist) CALL(1, 0, 0, 1, 0);
01094       if ( doEnergy && !doVirial &&  doSlow &&  doPairlist) CALL(1, 0, 1, 1, 0);
01095       if ( doEnergy &&  doVirial && !doSlow &&  doPairlist) CALL(1, 1, 0, 1, 0);
01096       if ( doEnergy &&  doVirial &&  doSlow &&  doPairlist) CALL(1, 1, 1, 1, 0);
01097     }
01098 
01099     if (!called) {
01100       NAMD_die("CudaComputeNonbondedKernel::nonbondedForce, none of the kernels called");
01101     }
01102 
01103 #undef CALL
01104     cudaCheck(cudaGetLastError());
01105 
01106     start += nblock*nwarp;
01107   }
01108 
01109 }
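// Illustration (not part of the NAMD source): the CALL macro above enumerates all 32
// combinations of the five boolean template parameters so that runtime flags select a fully
// specialized kernel and dead branches are compiled out. A reduced sketch of the same
// dispatch pattern with two flags; the kernel and helper names are illustrative only:
#if 0
template <bool doEnergy, bool doSlow>
__global__ void toyKernel(float* out);

static void launchToyKernel(bool doEnergy, bool doSlow, float* out, cudaStream_t stream) {
#define TOY_CALL(E, S) toyKernel<E, S> <<< 1, 32, 0, stream >>> (out)
  if (!doEnergy && !doSlow) TOY_CALL(false, false);
  if (!doEnergy &&  doSlow) TOY_CALL(false, true);
  if ( doEnergy && !doSlow) TOY_CALL(true,  false);
  if ( doEnergy &&  doSlow) TOY_CALL(true,  true);
#undef TOY_CALL
}
#endif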
01110 
01111 //
01112 // Perform virial and energy reductions for non-bonded force calculation
01113 //
01114 void CudaComputeNonbondedKernel::reduceVirialEnergy(CudaTileListKernel& tlKernel,
01115   const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS,
01116   float4* d_forces, float4* d_forcesSlow,
01117   VirialEnergy* d_virialEnergy, cudaStream_t stream) {
01118 
01119   if (doEnergy || doVirial) {
01120     clear_device_array<VirialEnergy>(d_virialEnergy, 1, stream);
01121   }
01122 
01123   if (doVirial)
01124   {
01125     int nthread = REDUCENONBONDEDVIRIALKERNEL_NUM_WARP*WARPSIZE;
01126     int nblock = min(deviceCUDA->getMaxNumBlocks(), (atomStorageSize-1)/nthread+1);
01127     reduceNonbondedVirialKernel <<< nblock, nthread, 0, stream >>>
01128     (doSlow, atomStorageSize, tlKernel.get_xyzq(), d_forces, d_forcesSlow, d_virialEnergy);
01129     cudaCheck(cudaGetLastError());
01130   }
01131 
01132   if (doVirial || doEnergy)
01133   {
01134     int nthread = REDUCEVIRIALENERGYKERNEL_NUM_WARP*WARPSIZE;
01135     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyLength()-1)/nthread+1);
01136     reduceVirialEnergyKernel <<< nblock, nthread, 0, stream >>>
01137     (doEnergy, doVirial, doSlow, tlKernel.getTileListVirialEnergyLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01138     cudaCheck(cudaGetLastError());
01139   }  
01140 
01141   if (doGBIS && doEnergy)
01142   {
01143     int nthread = REDUCEGBISENERGYKERNEL_NUM_WARP*WARPSIZE;
01144     int nblock = min(deviceCUDA->getMaxNumBlocks(), (tlKernel.getTileListVirialEnergyGBISLength()-1)/nthread+1);
01145     reduceGBISEnergyKernel <<< nblock, nthread, 0, stream >>>
01146     (tlKernel.getTileListVirialEnergyGBISLength(), tlKernel.getTileListVirialEnergy(), d_virialEnergy);
01147     cudaCheck(cudaGetLastError());
01148   }
01149 
01150 }
01151 
01152 void CudaComputeNonbondedKernel::bindExclusions(int numExclusions, unsigned int* exclusion_bits) {
01153   int nconst = ( numExclusions < MAX_CONST_EXCLUSIONS ? numExclusions : MAX_CONST_EXCLUSIONS );
01154   cudaCheck(cudaMemcpyToSymbol(constExclusions, exclusion_bits, nconst*sizeof(unsigned int), 0));
01155 
01156   reallocate_device<unsigned int>(&overflowExclusions, &overflowExclusionsSize, numExclusions);
01157   copy_HtoD_sync<unsigned int>(exclusion_bits, overflowExclusions, numExclusions);
01158 }
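// Illustration (not part of the NAMD source): bindExclusions() keeps two copies of the packed
// exclusion bitmap. The first min(numExclusions, MAX_CONST_EXCLUSIONS) words go into the
// constExclusions constant-memory cache and the full bitmap goes into overflowExclusions in
// global memory, matching the constant/overflow split in nonbondedForceKernel(). A hedged
// usage sketch (the caller-provided bitmap is illustrative only, not how NAMD builds it):
#if 0
void exampleBind(CudaComputeNonbondedKernel& kernel, unsigned int* exclusionBits, int numWords) {
  // exclusionBits: packed bitmap with 32 exclusion flags per word, numWords words in total
  kernel.bindExclusions(numWords, exclusionBits);
}
#endif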
