ComputeNonbondedCUDAKernelBase.h

00001 
00002 #ifdef NAMD_CUDA
00003 
00004 #define NAME(X) SLOWNAME( X )
00005 
00006 #undef SLOW
00007 #undef SLOWNAME
00008 #ifdef DO_SLOW
00009 #define SLOW(X) X
00010 #define SLOWNAME(X) ENERGYNAME( X ## _slow )
00011 #else
00012 #define SLOW(X)
00013 #define SLOWNAME(X) ENERGYNAME( X )
00014 #endif
00015 
00016 #undef ENERGY
00017 #undef ENERGYNAME
00018 #ifdef DO_ENERGY
00019 #define ENERGY(X) X
00020 #define ENERGYNAME(X) PAIRLISTNAME( X ## _energy )
00021 #else
00022 #define ENERGY(X)
00023 #define ENERGYNAME(X) PAIRLISTNAME( X )
00024 #endif
00025 
00026 #undef GENPAIRLIST
00027 #undef USEPAIRLIST
00028 #undef PAIRLISTNAME
00029 #ifdef MAKE_PAIRLIST
00030 #define GENPAIRLIST(X) X
00031 #define USEPAIRLIST(X)
00032 #define PAIRLISTNAME(X) LAST( X ## _pairlist )
00033 #else
00034 #define GENPAIRLIST(X)
00035 #define USEPAIRLIST(X) X
00036 #define PAIRLISTNAME(X) LAST( X )
00037 #endif
00038 
00039 #define LAST(X) X
00040 
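// The macros above assemble the kernel/function name from the compile-time
// flags. For example, with DO_SLOW, DO_ENERGY and MAKE_PAIRLIST all defined,
// the expansion chain is
//
//   NAME(dev_nonbonded) -> SLOWNAME(dev_nonbonded)
//                       -> ENERGYNAME(dev_nonbonded_slow)
//                       -> PAIRLISTNAME(dev_nonbonded_slow_energy)
//                       -> LAST(dev_nonbonded_slow_energy_pairlist)
//                       -> dev_nonbonded_slow_energy_pairlist
//
// With none of the flags defined, NAME(dev_nonbonded) is simply dev_nonbonded.
// The SLOW()/ENERGY()/GENPAIRLIST()/USEPAIRLIST() wrappers expand either to
// their argument or to nothing, so code wrapped in them is compiled only into
// the matching kernel variants.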
00041 #undef KEPLER_SHUFFLE
00042 #ifdef __CUDA_ARCH__
00043 #define KEPLER_SHUFFLE
00044 #if __CUDA_ARCH__ < 300
00045 #undef KEPLER_SHUFFLE
00046 #endif
00047 #endif
00048 
00049 #undef REG_JFORCE
00050 #ifdef KEPLER_SHUFFLE
00051 #ifndef MAKE_PAIRLIST
00052 #define REG_JFORCE
00053 #endif
00054 #endif
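// Summary of the feature macros defined above:
//   KEPLER_SHUFFLE - warp shuffle instructions are available (device code is
//                    compiled for compute capability >= 3.0), so warp-wide
//                    reductions and data exchange use WARP_SHUFFLE/_XOR
//                    instead of shared memory.
//   REG_JFORCE     - keep the per-tile j-forces in registers (rotated between
//                    lanes with shuffles) rather than in shared memory;
//                    enabled only for the non-pairlist-building variants.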
00055 
00056 #ifdef KEPLER_SHUFFLE
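// Warp-wide sum reduction: the XOR-shuffle butterfly below leaves every lane
// holding the sum of *reg over the whole warp; lane 0 then folds that sum into
// *smem with a single atomicAdd per warp.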
00057 __device__ __forceinline__ static void NAME(shfl_reduction)(
00058                 float *reg,
00059                 float *smem
00060                 )
00061 {
00062   *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 16, 32);
00063   *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 8, 32);
00064   *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 4, 32);
00065   *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 2, 32);
00066   *reg += WARP_SHUFFLE_XOR(WARP_FULL_MASK, *reg, 1, 32);
00067 
00068   if ( threadIdx.x % 32 == 0 ) {
00069       atomicAdd(smem,*reg);
00070   }
00071   return;
00072 }
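// Illustrative use (an assumption for clarity; no caller appears in this file):
//
//   __shared__ float block_sum;               // zeroed once per block elsewhere
//   float partial = ...;                      // this thread's contribution
//   NAME(shfl_reduction)(&partial, &block_sum);
//   BLOCK_SYNC;                               // block_sum now holds the block-wide total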
00073 #endif /* KEPLER_SHUFFLE */
00074 
00075 __device__ __forceinline__ 
00076 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
00077                                         const atom* atoms,
00078                                         volatile float* sh_buf,
00079 #ifndef KEPLER_SHUFFLE
00080                                         volatile float* sh_slow_buf, volatile float* sh_vcc,
00081 #endif
00082                                         float4* tmpforces, float4* slow_tmpforces,
00083                                         float4* forces, float4* slow_forces,
00084                                         float* tmpvirials, float* slow_tmpvirials, 
00085                                         float* virials, float* slow_virials);
00086 
00087 //
00088 // Reduces up to three variables into global memory locations dst[0], dst[1], dst[2]
00089 //
00090 template<typename T, int n, int sh_buf_size>
00091 __device__ __forceinline__
00092 void NAME(reduceVariables)(volatile T* sh_buf, T* dst, T val1, T val2, T val3) {
00093         // Sanity check
00094         cuda_static_assert(n > 0 && n <= NUM_WARP);
00095 #ifdef KEPLER_SHUFFLE
00096   // Requires NUM_WARP*n*sizeof(float) shared memory
00097   cuda_static_assert(sh_buf_size >= NUM_WARP*n*sizeof(T));
00098   // Reduce within warp
00099   for (int i=WARPSIZE/2;i >= 1;i/=2) {
00100     if (n >= 1) val1 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val1, i, WARPSIZE);
00101     if (n >= 2) val2 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val2, i, WARPSIZE);
00102     if (n >= 3) val3 += WARP_SHUFFLE_XOR(WARP_FULL_MASK, val3, i, WARPSIZE);
00103   }
00104   if (threadIdx.x == 0) {
00105     if (n >= 1) sh_buf[threadIdx.y*n + 0] = val1;
00106     if (n >= 2) sh_buf[threadIdx.y*n + 1] = val2;
00107     if (n >= 3) sh_buf[threadIdx.y*n + 2] = val3;
00108   }
00109   BLOCK_SYNC;
00110   if (threadIdx.x < n && threadIdx.y == 0) {
00111     T finalval = (T)0;
00112 #pragma unroll
00113     for (int i=0;i < NUM_WARP;++i) {
00114       finalval += sh_buf[i*n + threadIdx.x];
00115     }
00116     atomicAdd(&dst[threadIdx.x], finalval);
00117   }
00118 #else // ! KEPLER_SHUFFLE
00119   // Requires NUM_WARP*n*WARPSIZE*sizeof(float) shared memory
00120   cuda_static_assert(sh_buf_size >= NUM_WARP*n*WARPSIZE*sizeof(T));
00121   volatile T* sh_bufy = &sh_buf[threadIdx.y*n*WARPSIZE];
00122   if (n >= 1) sh_bufy[threadIdx.x*n + 0] = val1;
00123   if (n >= 2) sh_bufy[threadIdx.x*n + 1] = val2;
00124   if (n >= 3) sh_bufy[threadIdx.x*n + 2] = val3;
00125   // Reduce within warp
00126   for (int d=1;d < WARPSIZE;d*=2) {
00127     int pos = threadIdx.x + d;
00128     T val1t, val2t, val3t;
00129     if (n >= 1) val1t = (pos < WARPSIZE) ? sh_bufy[pos*n + 0] : (T)0;
00130     if (n >= 2) val2t = (pos < WARPSIZE) ? sh_bufy[pos*n + 1] : (T)0;
00131     if (n >= 3) val3t = (pos < WARPSIZE) ? sh_bufy[pos*n + 2] : (T)0;
00132     if (n >= 1) sh_bufy[threadIdx.x*n + 0] += val1t;
00133     if (n >= 2) sh_bufy[threadIdx.x*n + 1] += val2t;
00134     if (n >= 3) sh_bufy[threadIdx.x*n + 2] += val3t;
00135   }
00136   BLOCK_SYNC;
00137   if (threadIdx.x < n && threadIdx.y == 0) {
00138     T finalval = (T)0;
00139 #pragma unroll
00140     for (int i=0;i < NUM_WARP;++i) {
00141       finalval += sh_buf[i*n*WARPSIZE + threadIdx.x];
00142     }
00143     atomicAdd(&dst[threadIdx.x], finalval);
00144   }
00145 #endif // KEPLER_SHUFFLE
00146 }
00147 
00148 //
00149 // Called with a 2D thread block:
00150 // x-dimension = WARPSIZE = 32
00151 // y-dimension = NUM_WARP = 4
00152 //
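// A minimal host-side launch sketch consistent with the comment above (an
// assumption for illustration only; the real launcher lives elsewhere in NAMD):
//
//   dim3 nthread(WARPSIZE, NUM_WARP, 1);      // 32 x 4 = 128 threads per block
//   dim3 nblock(block_count, 1, 1);           // one block per patch-pair work unit
//   NAME(dev_nonbonded) <<< nblock, nthread, 0, stream >>> ( /* arguments */ );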
00153 __global__ static void
00154 GENPAIRLIST(__launch_bounds__(NUM_WARP*WARPSIZE, 10) )
00155 USEPAIRLIST(__launch_bounds__(NUM_WARP*WARPSIZE, 12) )
00156 NAME(dev_nonbonded)
00157      (const patch_pair* patch_pairs,
00158       const atom *atoms, const atom_param *atom_params,
00159       const int* vdw_types, unsigned int* plist,
00160       float4 *tmpforces, float4 *slow_tmpforces,
00161       float4 *forces, float4 *slow_forces,
00162       float* tmpvirials, float* slow_tmpvirials,
00163       float* virials, float* slow_virials,
00164       unsigned int* global_counters, int* force_ready_queue,
00165       const unsigned int *overflow_exclusions,
00166       const int npatches,
00167       const int block_begin, const int total_block_count, int* block_order,
00168       exclmask* exclmasks, const int lj_table_size,
00169       const float3 lata, const float3 latb, const float3 latc,
00170       const float cutoff2, const float plcutoff2, const int doSlow) {
00171 
00172   // Local structure definitions
00173   GENPAIRLIST(struct vdw_index {
00174     int vdw_type;
00175     int index;
00176   };)
00177 
00178   // Shared memory
00179   __shared__ patch_pair sh_patch_pair;
00180 #ifndef REG_JFORCE
00181   __shared__ float3 sh_jforce_2d[NUM_WARP][WARPSIZE];
00182   SLOW(__shared__ float3 sh_jforce_slow_2d[NUM_WARP][WARPSIZE];)
00183 #endif
00184 #ifndef KEPLER_SHUFFLE
00185   __shared__ atom sh_jpq_2d[NUM_WARP][WARPSIZE];
00186 #endif
00187   __shared__ float3 sh_iforcesum[SLOW(NUM_WARP+) NUM_WARP];
00188 
00189   ENERGY(
00190          float totalev = 0.f;
00191          float totalee = 0.f;
00192          SLOW( float totales = 0.f; )
00193         )
00194 
00195   GENPAIRLIST(int nexcluded=0;);
00196 
00197   {
00198 #ifndef KEPLER_SHUFFLE
00199   GENPAIRLIST(__shared__ atom_param sh_jap_2d[NUM_WARP][WARPSIZE];)
00200   USEPAIRLIST(__shared__ int sh_jap_vdw_type_2d[NUM_WARP][WARPSIZE];)
00201 #endif
00202   USEPAIRLIST(__shared__ int sh_plist_ind[NUM_WARP];
00203               __shared__ unsigned int sh_plist_val[NUM_WARP];);
00204 
00205   // Load patch_pair data into shared memory
00206   {
00207     const int t = threadIdx.x + threadIdx.y*WARPSIZE;
00208 
00209     if (t < 3*(SLOW(NUM_WARP+) NUM_WARP)) {
00210       float *p = (float *)sh_iforcesum;
00211       p[threadIdx.x] = 0.0f;
00212     }
00213 
00214     if (t < PATCH_PAIR_SIZE) {
00215       int* src = (int *)&patch_pairs[block_begin + blockIdx.x];
00216       int* dst = (int *)&sh_patch_pair;
00217       dst[t] = src[t];
00218     }
00219     // Need to sync here to make sure sh_patch_pair is ready
00220     BLOCK_SYNC;
00221 
00222     // Initialize pairlist index to impossible value
00223     USEPAIRLIST(if (threadIdx.x == 0) sh_plist_ind[threadIdx.y] = -1;);
00224 
00225     // Initialize pair list to "no interactions"
00226     GENPAIRLIST({
00227         if (t < sh_patch_pair.plist_size)
00228           plist[sh_patch_pair.plist_start + t] = 0;
00229       })
00230 
00231     // Convert the scaled offset to Cartesian coordinates using the current lattice and write it back into shared memory
00232     if (t == 0) {
00233       float offx = sh_patch_pair.offset.x * lata.x
00234         + sh_patch_pair.offset.y * latb.x
00235         + sh_patch_pair.offset.z * latc.x;
00236       float offy = sh_patch_pair.offset.x * lata.y
00237         + sh_patch_pair.offset.y * latb.y
00238         + sh_patch_pair.offset.z * latc.y;
00239       float offz = sh_patch_pair.offset.x * lata.z
00240         + sh_patch_pair.offset.y * latb.z
00241         + sh_patch_pair.offset.z * latc.z;
00242       sh_patch_pair.offset.x = offx;
00243       sh_patch_pair.offset.y = offy;
00244       sh_patch_pair.offset.z = offz;
00245     }
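    // In other words, the fractional offset s stored in the patch pair is mapped
    // to a Cartesian displacement: offset = s.x*lata + s.y*latb + s.z*latc,
    // where lata, latb, latc are the current lattice vectors.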
00246 
00247     BLOCK_SYNC;
00248   }
00249 
00250   // Compute pointers to shared memory to avoid pointer computation later on
00251 #ifndef REG_JFORCE
00252   volatile float3* sh_jforce      = &sh_jforce_2d[threadIdx.y][0];
00253   SLOW(volatile float3* sh_jforce_slow = &sh_jforce_slow_2d[threadIdx.y][0];)
00254 #endif
00255 
00256 #ifndef KEPLER_SHUFFLE
00257   atom* sh_jpq       = &sh_jpq_2d[threadIdx.y][0];
00258   GENPAIRLIST(atom_param* sh_jap = &sh_jap_2d[threadIdx.y][0];);
00259   USEPAIRLIST(int* sh_jap_vdw_type = &sh_jap_vdw_type_2d[threadIdx.y][0];);
00260 #endif
00261 
00262   for (int blocki = threadIdx.y*WARPSIZE;blocki < sh_patch_pair.patch1_size;blocki += WARPSIZE*NUM_WARP) {
00263 
00264     atom ipq;
00265     GENPAIRLIST(vdw_index iap;);
00266     USEPAIRLIST(int iap_vdw_type;);
00267     // Load i atom data
00268     if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
00269       int i = sh_patch_pair.patch1_start + blocki + threadIdx.x;
00270       float4 tmpa = ((float4*)atoms)[i];
00271       ipq.position.x = tmpa.x + sh_patch_pair.offset.x;
00272       ipq.position.y = tmpa.y + sh_patch_pair.offset.y;
00273       ipq.position.z = tmpa.z + sh_patch_pair.offset.z;
00274       ipq.charge = tmpa.w;
00275       GENPAIRLIST(uint4 tmpap = ((uint4*)atom_params)[i];
00276                   iap.vdw_type = tmpap.x*lj_table_size;
00277                   iap.index = tmpap.y;);
00278       USEPAIRLIST(iap_vdw_type = vdw_types[i]*lj_table_size;);
00279     }
00280 
00281     // i-forces in registers
00282     float3 iforce;
00283     iforce.x = 0.0f;
00284     iforce.y = 0.0f;
00285     iforce.z = 0.0f;
00286     SLOW(float3 iforce_slow;
00287          iforce_slow.x = 0.0f;
00288          iforce_slow.y = 0.0f;
00289          iforce_slow.z = 0.0f;)
00290 
00291     const bool diag_patch_pair = (sh_patch_pair.patch1_start == sh_patch_pair.patch2_start) &&
00292       (sh_patch_pair.offset.x == 0.0f && sh_patch_pair.offset.y == 0.0f && sh_patch_pair.offset.z == 0.0f);
00293     int blockj = (diag_patch_pair) ? blocki : 0;
00294     for (;blockj < sh_patch_pair.patch2_size;blockj += WARPSIZE) {
00295 
00296       USEPAIRLIST({
00297           const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
00298           int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
00299           int plist_ind = pos/32;
00300           unsigned int plist_bit = 1 << (pos % 32);
00301           // Check if we need to load the next entry in the pairlist
00302           if (plist_ind != sh_plist_ind[threadIdx.y]) {
00303                 sh_plist_val[threadIdx.y] = plist[sh_patch_pair.plist_start + plist_ind];
00304                 sh_plist_ind[threadIdx.y] = plist_ind;
00305           }
00306           if ((sh_plist_val[threadIdx.y] & plist_bit) == 0) continue;
00307         })
00308 
00309       // Load j atom data
00310 #ifdef KEPLER_SHUFFLE
00311       atom jpq;
00312       GENPAIRLIST(atom_param jap;);
00313       USEPAIRLIST(int jap_vdw_type;);
00314 #endif
00315 
00316       GENPAIRLIST(
00317                 // Avoid calculating pairs of blocks where all atoms in both blocks are fixed
00318                 if (blocki >= sh_patch_pair.patch1_free_size && blockj >= sh_patch_pair.patch2_free_size) continue;
00319                   int nfreej = sh_patch_pair.patch2_free_size - blockj;
00320                   int nloopj = min(sh_patch_pair.patch2_size - blockj, WARPSIZE);
00321                   );
00322 
00323       //GENPAIRLIST(bool inside_plcutoff = false;)
00324       if (blockj + threadIdx.x < sh_patch_pair.patch2_size) {
00325         int j = sh_patch_pair.patch2_start + blockj + threadIdx.x;
00326         float4 tmpa = ((float4*)atoms)[j];
00327 #ifdef KEPLER_SHUFFLE
00328         jpq.position.x = tmpa.x;
00329         jpq.position.y = tmpa.y;
00330         jpq.position.z = tmpa.z;
00331         jpq.charge = tmpa.w;
00332 #else
00333         sh_jpq[threadIdx.x].position.x = tmpa.x;
00334         sh_jpq[threadIdx.x].position.y = tmpa.y;
00335         sh_jpq[threadIdx.x].position.z = tmpa.z;
00336         sh_jpq[threadIdx.x].charge = tmpa.w;
00337 #endif
00338 
00339 #ifdef KEPLER_SHUFFLE
00340         GENPAIRLIST(jap = atom_params[j];)
00341         USEPAIRLIST(jap_vdw_type = vdw_types[j];)
00342 #else
00343         GENPAIRLIST(sh_jap[threadIdx.x] = atom_params[j];)
00344         USEPAIRLIST(sh_jap_vdw_type[threadIdx.x] = vdw_types[j];)
00345 #endif
00346       }
00347 
00348       // j-forces in shared memory
00349 #ifdef REG_JFORCE
00350       float3 jforce;
00351       jforce.x = 0.0f;
00352       jforce.y = 0.0f;
00353       jforce.z = 0.0f;
00354       SLOW(float3 jforce_slow;
00355            jforce_slow.x = 0.0f;
00356            jforce_slow.y = 0.0f;
00357            jforce_slow.z = 0.0f;
00358            );
00359 #else
00360       sh_jforce[threadIdx.x].x = 0.0f;
00361       sh_jforce[threadIdx.x].y = 0.0f;
00362       sh_jforce[threadIdx.x].z = 0.0f;
00363       SLOW(sh_jforce_slow[threadIdx.x].x = 0.0f;
00364            sh_jforce_slow[threadIdx.x].y = 0.0f;
00365            sh_jforce_slow[threadIdx.x].z = 0.0f;)
00366 #endif
00367 
00368       GENPAIRLIST(unsigned int excl = 0;)
00369       USEPAIRLIST(
00370                   const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
00371                   const int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
00372                   unsigned int excl = exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x];
00373                   );
00374       GENPAIRLIST(
00375                   int nloopi = sh_patch_pair.patch1_size - blocki;
00376                   if (nloopi > WARPSIZE) nloopi = WARPSIZE;
00377                   // NOTE: We must clamp nfreei to a non-negative value since it is compared against threadIdx.x (unsigned int) later on
00378                   int nfreei = max(sh_patch_pair.patch1_free_size - blocki, 0);
00379                   )
00380       const bool diag_tile = diag_patch_pair && (blocki == blockj);
00381       // Loop through tile diagonals. Local tile indices are:
00382       // i = threadIdx.x % WARPSIZE = constant
00383       // j = (t + threadIdx.x) % WARPSIZE
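      // For a diagonal (self) tile the two atom ranges are identical: the sweep
      // starts at t = 1 to skip the i == j self interaction, and modval =
      // 2*WARPSIZE-1 lets j = t + threadIdx.x run past WARPSIZE; such out-of-range
      // j are rejected by the bounds/exclusion tests below, so each unordered
      // pair {i,j} within the tile is evaluated exactly once.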
00384       const int modval = (diag_tile) ? 2*WARPSIZE-1 : WARPSIZE-1;
00385       int t = (diag_tile) ? 1 : 0;
00386       if (diag_tile) {
00387         USEPAIRLIST(excl >>= 1;);
00388 #ifdef KEPLER_SHUFFLE
00389         jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00390         USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
00391         GENPAIRLIST(jap.vdw_type     = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00392                     jap.index        = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00393                     jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00394                     jap.excl_index   = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00395                     );
00396 #endif
00397       }
00398 
00399       for (; t < WARPSIZE; ++t) {
00400         USEPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl & 1)))
00401         {
00402         GENPAIRLIST(excl >>= 1;);
00403         int j = (t + threadIdx.x) & modval;
00404 #ifdef KEPLER_SHUFFLE
00405         float tmpx = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.x, j, WARPSIZE) - ipq.position.x;
00406         float tmpy = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.y, j, WARPSIZE) - ipq.position.y;
00407         float tmpz = WARP_SHUFFLE(WARP_FULL_MASK, jpq.position.z, j, WARPSIZE) - ipq.position.z;
00408         GENPAIRLIST(
00409                     int j_vdw_type     = jap.vdw_type;
00410                     int j_index        = jap.index;
00411                     int j_excl_maxdiff = jap.excl_maxdiff;
00412                     int j_excl_index   = jap.excl_index;
00413                     );
00414         float j_charge = jpq.charge;
00415         USEPAIRLIST(
00416                     int j_vdw_type = jap_vdw_type;
00417                     );
00418 #endif
00419         GENPAIRLIST(if (j < nloopj && threadIdx.x < nloopi && (j < nfreej || threadIdx.x < nfreei) ))
00420           {
00421 
00422 #ifndef KEPLER_SHUFFLE
00423           float tmpx = sh_jpq[j].position.x - ipq.position.x;
00424           float tmpy = sh_jpq[j].position.y - ipq.position.y;
00425           float tmpz = sh_jpq[j].position.z - ipq.position.z;
00426           GENPAIRLIST(
00427                       int j_vdw_type     = sh_jap[j].vdw_type;
00428                       int j_index        = sh_jap[j].index;
00429                       int j_excl_maxdiff = sh_jap[j].excl_maxdiff;
00430                       int j_excl_index   = sh_jap[j].excl_index;
00431                       );
00432           float j_charge = sh_jpq[j].charge;
00433           USEPAIRLIST(
00434                       int j_vdw_type = sh_jap_vdw_type[j];
00435                       );
00436 #endif
00437           float r2 = tmpx*tmpx + tmpy*tmpy + tmpz*tmpz;
00438           GENPAIRLIST(if (r2 < plcutoff2))
00439           USEPAIRLIST(if ((excl & 1) && r2 < cutoff2))
00440             {
00441             GENPAIRLIST(
00442                         bool excluded = false;
00443                         int indexdiff = (int)(iap.index) - j_index;
00444                         if ( abs(indexdiff) <= j_excl_maxdiff) {
00445                             indexdiff += j_excl_index;
00446                             int indexword = ((unsigned int) indexdiff) >> 5;
00447                             //indexword = tex1Dfetch(tex_exclusions, indexword);
00448                             if ( indexword < MAX_CONST_EXCLUSIONS )
00449                               indexword = const_exclusions[indexword];
00450                             else {
00451                               indexword = overflow_exclusions[indexword];
00452                             }
00453                             excluded = ((indexword & (1<<(indexdiff&31))) != 0);
00454                             if (excluded) nexcluded++;
00455                         }
00456                         if (!excluded) excl |= 0x80000000;
00457                         )
00458             GENPAIRLIST(if ( ! excluded && r2 < cutoff2))
00459             {
00460             ENERGY( float rsqrtfr2; );
00461             float4 fi = tex1D(force_table, ENERGY(rsqrtfr2 =) rsqrtf(r2));
00462             ENERGY( float4 ei = tex1D(energy_table, rsqrtfr2); );
00463             GENPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap.vdw_type););
00464             USEPAIRLIST(float2 ljab = tex1Dfetch(lj_table, j_vdw_type + iap_vdw_type););
00465 
00466               float f_slow = ipq.charge * j_charge;
00467               float f = ljab.x * fi.z + ljab.y * fi.y + f_slow * fi.x;
00468               ENERGY(
00469                      float ev = ljab.x * ei.z + ljab.y * ei.y;
00470                      float ee = f_slow * ei.x;
00471                      SLOW( float es = f_slow * ei.w; )
00472                      )
00473               SLOW( f_slow *= fi.w; )
00474               ENERGY(
00475                      totalev += ev;
00476                      totalee += ee;
00477                      SLOW( totales += es; )
00478                      )
00479               float fx = tmpx * f;
00480               float fy = tmpy * f;
00481               float fz = tmpz * f;
00482               iforce.x += fx;
00483               iforce.y += fy;
00484               iforce.z += fz;
00485 #ifdef REG_JFORCE
00486               jforce.x -= fx;
00487               jforce.y -= fy;
00488               jforce.z -= fz;
00489 #else
00490               sh_jforce[j].x -= fx;
00491               sh_jforce[j].y -= fy;
00492               sh_jforce[j].z -= fz;
00493 #endif
00494               SLOW(
00495                    float fx_slow = tmpx * f_slow;
00496                    float fy_slow = tmpy * f_slow;
00497                    float fz_slow = tmpz * f_slow;
00498                    iforce_slow.x += fx_slow;
00499                    iforce_slow.y += fy_slow;
00500                    iforce_slow.z += fz_slow;
00501                    )
00502 #ifdef REG_JFORCE
00503               SLOW(
00504                    jforce_slow.x -= fx_slow;
00505                    jforce_slow.y -= fy_slow;
00506                    jforce_slow.z -= fz_slow;
00507                    )
00508 #else
00509               SLOW(
00510                    sh_jforce_slow[j].x -= fx_slow;
00511                    sh_jforce_slow[j].y -= fy_slow;
00512                    sh_jforce_slow[j].z -= fz_slow;
00513                    )
00514 #endif
00515             }
00516             } // cutoff
00517         } // if (j < nloopj...)
00518 }
00519         USEPAIRLIST(excl >>= 1;);
00520 #ifdef KEPLER_SHUFFLE
00521         jpq.charge = WARP_SHUFFLE(WARP_FULL_MASK, jpq.charge, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00522         USEPAIRLIST(jap_vdw_type = WARP_SHUFFLE(WARP_FULL_MASK, jap_vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE ););
00523         GENPAIRLIST(jap.vdw_type     = WARP_SHUFFLE(WARP_FULL_MASK, jap.vdw_type, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00524                     jap.index        = WARP_SHUFFLE(WARP_FULL_MASK, jap.index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00525                     jap.excl_maxdiff = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_maxdiff, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00526                     jap.excl_index   = WARP_SHUFFLE(WARP_FULL_MASK, jap.excl_index, (threadIdx.x+1) & (WARPSIZE-1), WARPSIZE );
00527                     );
00528 #ifdef REG_JFORCE
00529         jforce.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00530         jforce.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00531         jforce.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00532         SLOW(
00533              jforce_slow.x = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.x, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00534              jforce_slow.y = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.y, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00535              jforce_slow.z = WARP_SHUFFLE(WARP_FULL_MASK, jforce_slow.z, (threadIdx.x+1)&(WARPSIZE-1), WARPSIZE);
00536              );
00537 #endif
00538 #endif
00539       } // t
00540 
00541       // Write j-forces
00542       GENPAIRLIST(if (WARP_ANY(WARP_FULL_MASK, excl != 0))) {
00543         if ( blockj + threadIdx.x < sh_patch_pair.patch2_size ) {
00544           int jforce_pos = sh_patch_pair.patch2_start + blockj + threadIdx.x;
00545 #ifdef REG_JFORCE
00546           atomicAdd(&tmpforces[jforce_pos].x, jforce.x);
00547           atomicAdd(&tmpforces[jforce_pos].y, jforce.y);
00548           atomicAdd(&tmpforces[jforce_pos].z, jforce.z);
00549           SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, jforce_slow.x);
00550                atomicAdd(&slow_tmpforces[jforce_pos].y, jforce_slow.y);
00551                atomicAdd(&slow_tmpforces[jforce_pos].z, jforce_slow.z););
00552 #else
00553           atomicAdd(&tmpforces[jforce_pos].x, sh_jforce[threadIdx.x].x);
00554           atomicAdd(&tmpforces[jforce_pos].y, sh_jforce[threadIdx.x].y);
00555           atomicAdd(&tmpforces[jforce_pos].z, sh_jforce[threadIdx.x].z);
00556           SLOW(atomicAdd(&slow_tmpforces[jforce_pos].x, sh_jforce_slow[threadIdx.x].x);
00557                atomicAdd(&slow_tmpforces[jforce_pos].y, sh_jforce_slow[threadIdx.x].y);
00558                atomicAdd(&slow_tmpforces[jforce_pos].z, sh_jforce_slow[threadIdx.x].z););
00559 #endif
00560         }
00561 
00562       GENPAIRLIST(
00563                   const int size2 = (sh_patch_pair.patch2_size-1)/WARPSIZE+1;
00564                   int pos = (blockj/WARPSIZE) + (blocki/WARPSIZE)*size2;
00565                   exclmasks[sh_patch_pair.exclmask_start+pos].excl[threadIdx.x] = excl;
00566                   if (threadIdx.x == 0) {
00567                     int plist_ind = pos/32;
00568                     unsigned int plist_bit = 1 << (pos % 32);
00569                     atomicOr(&plist[sh_patch_pair.plist_start + plist_ind], plist_bit);
00570                   }
00571                   );
00572       }
00573 
00574     } // for (blockj)
00575 
00576     // Write i-forces
00577     if (blocki + threadIdx.x < sh_patch_pair.patch1_size) {
00578       int iforce_pos = sh_patch_pair.patch1_start + blocki + threadIdx.x;
00579       atomicAdd(&tmpforces[iforce_pos].x, iforce.x);
00580       atomicAdd(&tmpforces[iforce_pos].y, iforce.y);
00581       atomicAdd(&tmpforces[iforce_pos].z, iforce.z);
00582       SLOW(atomicAdd(&slow_tmpforces[iforce_pos].x, iforce_slow.x);
00583            atomicAdd(&slow_tmpforces[iforce_pos].y, iforce_slow.y);
00584            atomicAdd(&slow_tmpforces[iforce_pos].z, iforce_slow.z););
00585     }
00586     // Accumulate total forces for virial (warp synchronous)
00587 #ifdef KEPLER_SHUFFLE
00588     for (int i=WARPSIZE/2;i >= 1;i/=2) {
00589       iforce.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.x, i, WARPSIZE);
00590       iforce.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.y, i, WARPSIZE);
00591       iforce.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce.z, i, WARPSIZE);
00592       SLOW(
00593            iforce_slow.x += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.x, i, WARPSIZE);
00594            iforce_slow.y += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.y, i, WARPSIZE);
00595            iforce_slow.z += WARP_SHUFFLE_XOR(WARP_FULL_MASK, iforce_slow.z, i, WARPSIZE);
00596            );
00597     }
00598     if (threadIdx.x == 0) {
00599       sh_iforcesum[threadIdx.y].x += iforce.x;
00600       sh_iforcesum[threadIdx.y].y += iforce.y;
00601       sh_iforcesum[threadIdx.y].z += iforce.z;
00602       SLOW(
00603            sh_iforcesum[threadIdx.y+NUM_WARP].x += iforce_slow.x;
00604            sh_iforcesum[threadIdx.y+NUM_WARP].y += iforce_slow.y;
00605            sh_iforcesum[threadIdx.y+NUM_WARP].z += iforce_slow.z;
00606            );
00607     }
00608 #else
00609     sh_jforce[threadIdx.x].x = iforce.x;
00610     sh_jforce[threadIdx.x].y = iforce.y;
00611     sh_jforce[threadIdx.x].z = iforce.z;
00612     SLOW(
00613          sh_jforce_slow[threadIdx.x].x = iforce_slow.x;
00614          sh_jforce_slow[threadIdx.x].y = iforce_slow.y;
00615          sh_jforce_slow[threadIdx.x].z = iforce_slow.z;
00616          );
00617     for (int d=1;d < WARPSIZE;d*=2) {
00618       int pos = threadIdx.x + d;
00619       float valx = (pos < WARPSIZE) ? sh_jforce[pos].x : 0.0f;
00620       float valy = (pos < WARPSIZE) ? sh_jforce[pos].y : 0.0f;
00621       float valz = (pos < WARPSIZE) ? sh_jforce[pos].z : 0.0f;
00622       SLOW(
00623            float slow_valx = (pos < WARPSIZE) ? sh_jforce_slow[pos].x : 0.0f;
00624            float slow_valy = (pos < WARPSIZE) ? sh_jforce_slow[pos].y : 0.0f;
00625            float slow_valz = (pos < WARPSIZE) ? sh_jforce_slow[pos].z : 0.0f;
00626            );
00627       sh_jforce[threadIdx.x].x += valx;
00628       sh_jforce[threadIdx.x].y += valy;
00629       sh_jforce[threadIdx.x].z += valz;
00630       SLOW(
00631            sh_jforce_slow[threadIdx.x].x += slow_valx;
00632            sh_jforce_slow[threadIdx.x].y += slow_valy;
00633            sh_jforce_slow[threadIdx.x].z += slow_valz;
00634            );
00635     }
00636     if (threadIdx.x == 0) {
00637       sh_iforcesum[threadIdx.y].x += sh_jforce[threadIdx.x].x;
00638       sh_iforcesum[threadIdx.y].y += sh_jforce[threadIdx.x].y;
00639       sh_iforcesum[threadIdx.y].z += sh_jforce[threadIdx.x].z;
00640       SLOW(
00641            sh_iforcesum[threadIdx.y+NUM_WARP].x += sh_jforce_slow[threadIdx.x].x;
00642            sh_iforcesum[threadIdx.y+NUM_WARP].y += sh_jforce_slow[threadIdx.x].y;
00643            sh_iforcesum[threadIdx.y+NUM_WARP].z += sh_jforce_slow[threadIdx.x].z;
00644            );
00645     }
00646 #endif
00647 
00648   } // for (blocki)
00649 
00650   }
00651 
00652   {
00653 
00654 #ifdef REG_JFORCE
00655 #undef SH_BUF_SIZE
00656 #define SH_BUF_SIZE NUM_WARP*(SLOW(9)+9)*sizeof(float)
00657     __shared__ float sh_buf[NUM_WARP*(SLOW(9)+9)];
00658 #else // ! REG_JFORCE
00659 #undef SH_BUF_SIZE
00660 #define SH_BUF_SIZE NUM_WARP*WARPSIZE*3*sizeof(float)
00661     volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
00662         // Sync here to make sure we can write into shared memory (sh_jforce_2d)
00663         BLOCK_SYNC;
00664 #endif
00665 
00666 #if ENERGY(1+)0
00667         NAME(reduceVariables)<float, SLOW(1+)2, SH_BUF_SIZE>(sh_buf, &tmpvirials[sh_patch_pair.patch1_ind*16 + 9], totalev, totalee, SLOW(totales+)0.0f);
00668 #endif
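// Note on the preprocessor guards: ENERGY(1+)0 expands to 1+0 when DO_ENERGY is
// defined and to 0 otherwise, so the reduction above is compiled only into the
// energy-evaluating kernel variants; GENPAIRLIST(1+)0 below plays the same role
// for the pairlist-building variants. Likewise SLOW(1+)2 selects 3 or 2 reduced
// variables depending on DO_SLOW.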
00669 
00670 #if GENPAIRLIST(1+)0
00671         ENERGY(BLOCK_SYNC);
00672     NAME(reduceVariables)<int, 1, SH_BUF_SIZE>((int *)sh_buf, (int *)&tmpvirials[sh_patch_pair.patch1_ind*16 + 12], nexcluded, 0, 0);
00673 #endif
00674 
00675   // Virials
00676   BLOCK_SYNC;
00677   if (threadIdx.x < SLOW(3+)3 && threadIdx.y == 0) {
00678     float* sh_virials = (float *)sh_iforcesum + (threadIdx.x % 3) + (threadIdx.x/3)*3*NUM_WARP;
00679     float iforcesum = 0.0f;
00680 #pragma unroll
00681     for (int i=0;i < 3*NUM_WARP;i+=3) iforcesum += sh_virials[i];
00682     float vx = iforcesum*sh_patch_pair.offset.x;
00683     float vy = iforcesum*sh_patch_pair.offset.y;
00684     float vz = iforcesum*sh_patch_pair.offset.z;
00685     sh_iforcesum[threadIdx.x].x = vx;
00686     sh_iforcesum[threadIdx.x].y = vy;
00687     sh_iforcesum[threadIdx.x].z = vz;
00688   }
00689   if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
00690     // virials are in sh_virials[0...8] and slow virials in sh_virials[9...17]
00691     float* sh_virials = (float *)sh_iforcesum;
00692     int patch1_ind = sh_patch_pair.patch1_ind;
00693     float *dst = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
00694     atomicAdd(&dst[patch1_ind*16 + (threadIdx.x % 9)], sh_virials[threadIdx.x]);
00695   }
00696 
00697   // Make sure forces are up-to-date in device global memory
00698   __threadfence();
00699   BLOCK_SYNC;
00700 
00701   // Mark patch pair (patch1_ind, patch2_ind) as "done"
00702   int patch1_ind = sh_patch_pair.patch1_ind;
00703   int patch2_ind = sh_patch_pair.patch2_ind;
00704   if (threadIdx.x == 0 && threadIdx.y == 0) {
00705     sh_patch_pair.patch_done[0] = false;
00706     sh_patch_pair.patch_done[1] = false;
00707     //
00708     // global_counters[0]: force_ready_queue
00709     // global_counters[1]: block_order
00710     // global_counters[i+2] (i = 0 ... npatches-1): number of patch pairs finished for patch i
00711     //
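    // atomicInc(&ctr, limit) returns the old value and wraps the counter to 0
    // once it would exceed 'limit'. Using patchN_num_pairs-1 as the limit
    // therefore both detects the last contributing patch pair
    // (old+1 == num_pairs) and leaves the counter reset to 0 for the next
    // force evaluation.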
00712     unsigned int patch1_num_pairs = sh_patch_pair.patch1_num_pairs;
00713     int patch1_old = atomicInc(&global_counters[patch1_ind+2], patch1_num_pairs-1);
00714     if (patch1_old+1 == patch1_num_pairs) sh_patch_pair.patch_done[0] = true;
00715     if (patch1_ind != patch2_ind) {
00716       unsigned int patch2_num_pairs = sh_patch_pair.patch2_num_pairs;
00717       int patch2_old = atomicInc(&global_counters[patch2_ind+2], patch2_num_pairs-1);
00718       if (patch2_old+1 == patch2_num_pairs) sh_patch_pair.patch_done[1] = true;
00719     }
00720   }
00721   // Sync threads so that patch_done[0] and patch_done[1] are visible to all threads
00722   BLOCK_SYNC;
00723 
00724   if (sh_patch_pair.patch_done[0]) {
00725 
00726 // #ifndef REG_JFORCE
00727 //     volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
00728 // #endif
00729 #ifndef KEPLER_SHUFFLE
00730     volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
00731     volatile float* sh_slow_buf = NULL;
00732     SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
00733 #endif
00734     NAME(finish_forces_virials)(sh_patch_pair.patch1_start, sh_patch_pair.patch1_size,
00735                                 patch1_ind, atoms, sh_buf,
00736 #ifndef KEPLER_SHUFFLE
00737                                 sh_slow_buf, sh_vcc,
00738 #endif
00739                                 tmpforces, slow_tmpforces, forces, slow_forces,
00740                                 tmpvirials, slow_tmpvirials, virials, slow_virials);
00741 
00742   }
00743 
00744   if (sh_patch_pair.patch_done[1]) {
00745 // #ifndef REG_JFORCE
00746 //     volatile float* sh_buf = (float *)&sh_jforce_2d[0][0];
00747 // #endif
00748 #ifndef KEPLER_SHUFFLE
00749     volatile float* sh_vcc = (volatile float*)&sh_jpq_2d[0][0];
00750     volatile float* sh_slow_buf = NULL;
00751     SLOW(sh_slow_buf = (volatile float*)&sh_jforce_slow_2d[0][0];)
00752 #endif
00753     NAME(finish_forces_virials)(sh_patch_pair.patch2_start, sh_patch_pair.patch2_size,
00754                                 patch2_ind, atoms, sh_buf,
00755 #ifndef KEPLER_SHUFFLE
00756                                 sh_slow_buf, sh_vcc,
00757 #endif
00758                                 tmpforces, slow_tmpforces, forces, slow_forces,
00759                                 tmpvirials, slow_tmpvirials, virials, slow_virials);
00760   }
00761 
00762   if (force_ready_queue != NULL && (sh_patch_pair.patch_done[0] || sh_patch_pair.patch_done[1])) {
00763         // Make sure page-locked host forces are up-to-date
00764 #if __CUDA_ARCH__ < 200
00765     __threadfence();
00766 #else
00767     __threadfence_system();
00768 #endif
00769     BLOCK_SYNC;
00770     // Add patch into "force_ready_queue"
00771     if (threadIdx.x == 0 && threadIdx.y == 0) {
00772         if (sh_patch_pair.patch_done[0]) {
00773         int ind = atomicInc(&global_counters[0], npatches-1);
00774         force_ready_queue[ind] = patch1_ind;
00775       }
00776       if (sh_patch_pair.patch_done[1]) {
00777         int ind = atomicInc(&global_counters[0], npatches-1);
00778         force_ready_queue[ind] = patch2_ind;
00779       }
00780       // Make sure "force_ready_queue" is visible in page-locked host memory
00781 #if __CUDA_ARCH__ < 200
00782       __threadfence();
00783 #else
00784       __threadfence_system();
00785 #endif
00786     }
00787   }
00788 
00789   if (threadIdx.x == 0 && threadIdx.y == 0 && block_order != NULL) {
00790     int old = atomicInc(&global_counters[1], total_block_count-1);
00791     block_order[old] = block_begin + blockIdx.x;
00792   }
00793   }
00794 
00795 }
00796 
00797 //
00798 // Copy patch forces to their final resting place in page-locked host memory
00799 // and reduce virials
00800 //
00801 __device__ __forceinline__ 
00802 static void NAME(finish_forces_virials)(const int start, const int size, const int patch_ind,
00803                                         const atom* atoms,
00804                                         volatile float* sh_buf,
00805 #ifndef KEPLER_SHUFFLE
00806                                         volatile float* sh_slow_buf, volatile float* sh_vcc,
00807 #endif
00808                                         float4* tmpforces, float4* slow_tmpforces,
00809                                         float4* forces, float4* slow_forces,
00810                                         float* tmpvirials, float* slow_tmpvirials, 
00811                                         float* virials, float* slow_virials) {
00812   float vxx = 0.f;
00813   float vxy = 0.f;
00814   float vxz = 0.f;
00815   float vyx = 0.f;
00816   float vyy = 0.f;
00817   float vyz = 0.f;
00818   float vzx = 0.f;
00819   float vzy = 0.f;
00820   float vzz = 0.f;
00821   SLOW(
00822        float slow_vxx = 0.f;
00823        float slow_vxy = 0.f;
00824        float slow_vxz = 0.f;
00825        float slow_vyx = 0.f;
00826        float slow_vyy = 0.f;
00827        float slow_vyz = 0.f;
00828        float slow_vzx = 0.f;
00829        float slow_vzy = 0.f;
00830        float slow_vzz = 0.f;
00831        )
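  // The per-patch virial contribution is accumulated as the outer product of
  // force and position, v_ab += f_a * r_b, summed over the atoms handled by this
  // thread; the warp/block reduction below then combines the per-thread partials.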
00832   for (int i=threadIdx.x+threadIdx.y*WARPSIZE;i < size;i+=NUM_WARP*WARPSIZE) {
00833     const int p = start+i;
00834     float4 f = tmpforces[p];
00835     forces[p] = f;
00836     float4 pos = ((float4*)atoms)[p];
00837     vxx += f.x * pos.x;
00838     vxy += f.x * pos.y;
00839     vxz += f.x * pos.z;
00840     vyx += f.y * pos.x;
00841     vyy += f.y * pos.y;
00842     vyz += f.y * pos.z;
00843     vzx += f.z * pos.x;
00844     vzy += f.z * pos.y;
00845     vzz += f.z * pos.z;
00846     SLOW(
00847          float4 slow_f = slow_tmpforces[p];
00848          slow_forces[p] = slow_f;
00849          slow_vxx += slow_f.x * pos.x;
00850          slow_vxy += slow_f.x * pos.y;
00851          slow_vxz += slow_f.x * pos.z;
00852          slow_vyx += slow_f.y * pos.x;
00853          slow_vyy += slow_f.y * pos.y;
00854          slow_vyz += slow_f.y * pos.z;
00855          slow_vzx += slow_f.z * pos.x;
00856          slow_vzy += slow_f.z * pos.y;
00857          slow_vzz += slow_f.z * pos.z;
00858          )
00859   }
00860 #ifdef KEPLER_SHUFFLE
00861   // Reduce within warps
00862   for (int i=WARPSIZE/2;i >= 1;i/=2) {
00863     vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxx, i, WARPSIZE);
00864     vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxy, i, WARPSIZE);
00865     vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vxz, i, WARPSIZE);
00866     vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyx, i, WARPSIZE);
00867     vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyy, i, WARPSIZE);
00868     vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vyz, i, WARPSIZE);
00869     vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzx, i, WARPSIZE);
00870     vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzy, i, WARPSIZE);
00871     vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, vzz, i, WARPSIZE);
00872     SLOW(
00873          slow_vxx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxx, i, WARPSIZE);
00874          slow_vxy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxy, i, WARPSIZE);
00875          slow_vxz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vxz, i, WARPSIZE);
00876          slow_vyx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyx, i, WARPSIZE);
00877          slow_vyy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyy, i, WARPSIZE);
00878          slow_vyz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vyz, i, WARPSIZE);
00879          slow_vzx += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzx, i, WARPSIZE);
00880          slow_vzy += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzy, i, WARPSIZE);
00881          slow_vzz += WARP_SHUFFLE_XOR(WARP_FULL_MASK, slow_vzz, i, WARPSIZE);
00882          )
00883   }
00884   // Reduce between warps
00885   // Requires NUM_WARP*(SLOW(9)+9)*sizeof(float) amount of shared memory
00886   if (threadIdx.x == 0) {
00887     sh_buf[threadIdx.y*(SLOW(9)+9) + 0] = vxx;
00888     sh_buf[threadIdx.y*(SLOW(9)+9) + 1] = vxy;
00889     sh_buf[threadIdx.y*(SLOW(9)+9) + 2] = vxz;
00890     sh_buf[threadIdx.y*(SLOW(9)+9) + 3] = vyx;
00891     sh_buf[threadIdx.y*(SLOW(9)+9) + 4] = vyy;
00892     sh_buf[threadIdx.y*(SLOW(9)+9) + 5] = vyz;
00893     sh_buf[threadIdx.y*(SLOW(9)+9) + 6] = vzx;
00894     sh_buf[threadIdx.y*(SLOW(9)+9) + 7] = vzy;
00895     sh_buf[threadIdx.y*(SLOW(9)+9) + 8] = vzz;
00896     SLOW(
00897          sh_buf[threadIdx.y*(SLOW(9)+9) + 9]  = slow_vxx;
00898          sh_buf[threadIdx.y*(SLOW(9)+9) + 10] = slow_vxy;
00899          sh_buf[threadIdx.y*(SLOW(9)+9) + 11] = slow_vxz;
00900          sh_buf[threadIdx.y*(SLOW(9)+9) + 12] = slow_vyx;
00901          sh_buf[threadIdx.y*(SLOW(9)+9) + 13] = slow_vyy;
00902          sh_buf[threadIdx.y*(SLOW(9)+9) + 14] = slow_vyz;
00903          sh_buf[threadIdx.y*(SLOW(9)+9) + 15] = slow_vzx;
00904          sh_buf[threadIdx.y*(SLOW(9)+9) + 16] = slow_vzy;
00905          sh_buf[threadIdx.y*(SLOW(9)+9) + 17] = slow_vzz;
00906          )
00907   }
00908   BLOCK_SYNC;
00909   // Write final virials into global memory
00910   if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
00911     float v = 0.0f;
00912 #pragma unroll
00913     for (int i=0;i < NUM_WARP;i++) v += sh_buf[i*(SLOW(9)+9) + threadIdx.x];
00914     float* dst = (threadIdx.x < 9) ? virials : slow_virials;
00915     const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
00916     int pos = patch_ind*16 + (threadIdx.x % 9);
00917     dst[pos] = v + src[pos];
00918   }
00919 #else // ! KEPLER_SHUFFLE
00920   // We have a total of NUM_WARP*WARPSIZE*3 floats; reduce them in sets of three
00921   // (NOTE: we do have more shared memory available, so this could be optimized further
00922   //        for pre-Kepler architectures.)
00923   const int t = threadIdx.x + threadIdx.y*WARPSIZE;
00924   volatile float* sh_v1 = &sh_buf[0];
00925   volatile float* sh_v2 = &sh_buf[NUM_WARP*WARPSIZE];
00926   volatile float* sh_v3 = &sh_buf[2*NUM_WARP*WARPSIZE];
00927   SLOW(
00928        volatile float* sh_slow_v1 = &sh_slow_buf[0];
00929        volatile float* sh_slow_v2 = &sh_slow_buf[NUM_WARP*WARPSIZE];
00930        volatile float* sh_slow_v3 = &sh_slow_buf[2*NUM_WARP*WARPSIZE];
00931        )
00932 
00933   // vxx, vxy, vxz
00934   sh_v1[t] = vxx;
00935   sh_v2[t] = vxy;
00936   sh_v3[t] = vxz;
00937   SLOW(
00938        sh_slow_v1[t] = slow_vxx;
00939        sh_slow_v2[t] = slow_vxy;
00940        sh_slow_v3[t] = slow_vxz;
00941        )
00942   for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
00943     int pos = t + d;
00944     float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
00945     float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
00946     float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
00947     SLOW(
00948          float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
00949          float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
00950          float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
00951          )
00952     BLOCK_SYNC;
00953     sh_v1[t] += v1;
00954     sh_v2[t] += v2;
00955     sh_v3[t] += v3;
00956     SLOW(
00957          sh_slow_v1[t] += slow_v1;
00958          sh_slow_v2[t] += slow_v2;
00959          sh_slow_v3[t] += slow_v3;
00960          )
00961     BLOCK_SYNC;
00962   }
00963   if (threadIdx.x == 0 && threadIdx.y == 0) {
00964     sh_vcc[0] = sh_v1[0];
00965     sh_vcc[1] = sh_v2[0];
00966     sh_vcc[2] = sh_v3[0];
00967     SLOW(
00968          sh_vcc[9+0] = sh_slow_v1[0];
00969          sh_vcc[9+1] = sh_slow_v2[0];
00970          sh_vcc[9+2] = sh_slow_v3[0];
00971          )
00972   }
00973   // vyx, vyy, vyz
00974   sh_v1[t] = vyx;
00975   sh_v2[t] = vyy;
00976   sh_v3[t] = vyz;
00977   SLOW(
00978        sh_slow_v1[t] = slow_vyx;
00979        sh_slow_v2[t] = slow_vyy;
00980        sh_slow_v3[t] = slow_vyz;
00981        )
00982   for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
00983     int pos = t + d;
00984     float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
00985     float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
00986     float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
00987     SLOW(
00988          float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
00989          float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
00990          float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
00991          )
00992     BLOCK_SYNC;
00993     sh_v1[t] += v1;
00994     sh_v2[t] += v2;
00995     sh_v3[t] += v3;
00996     SLOW(
00997          sh_slow_v1[t] += slow_v1;
00998          sh_slow_v2[t] += slow_v2;
00999          sh_slow_v3[t] += slow_v3;
01000          )
01001     BLOCK_SYNC;
01002   }
01003   if (threadIdx.x == 0 && threadIdx.y == 0) {
01004     sh_vcc[3] = sh_v1[0];
01005     sh_vcc[4] = sh_v2[0];
01006     sh_vcc[5] = sh_v3[0];
01007     SLOW(
01008          sh_vcc[9+3] = sh_slow_v1[0];
01009          sh_vcc[9+4] = sh_slow_v2[0];
01010          sh_vcc[9+5] = sh_slow_v3[0];
01011          )
01012   }
01013   // vzx, vzy, vzz
01014   sh_v1[t] = vzx;
01015   sh_v2[t] = vzy;
01016   sh_v3[t] = vzz;
01017   SLOW(
01018        sh_slow_v1[t] = slow_vzx;
01019        sh_slow_v2[t] = slow_vzy;
01020        sh_slow_v3[t] = slow_vzz;
01021        )
01022   for (int d=1;d < NUM_WARP*WARPSIZE;d*=2) {
01023     int pos = t + d;
01024     float v1 = (pos < NUM_WARP*WARPSIZE) ? sh_v1[pos] : 0.0f;
01025     float v2 = (pos < NUM_WARP*WARPSIZE) ? sh_v2[pos] : 0.0f;
01026     float v3 = (pos < NUM_WARP*WARPSIZE) ? sh_v3[pos] : 0.0f;
01027     SLOW(
01028          float slow_v1 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v1[pos] : 0.0f;
01029          float slow_v2 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v2[pos] : 0.0f;
01030          float slow_v3 = (pos < NUM_WARP*WARPSIZE) ? sh_slow_v3[pos] : 0.0f;
01031          )
01032     BLOCK_SYNC;
01033     sh_v1[t] += v1;
01034     sh_v2[t] += v2;
01035     sh_v3[t] += v3;
01036     SLOW(
01037          sh_slow_v1[t] += slow_v1;
01038          sh_slow_v2[t] += slow_v2;
01039          sh_slow_v3[t] += slow_v3;
01040          )
01041     BLOCK_SYNC;
01042   }
01043   if (threadIdx.x == 0 && threadIdx.y == 0) {
01044     sh_vcc[6] = sh_v1[0];
01045     sh_vcc[7] = sh_v2[0];
01046     sh_vcc[8] = sh_v3[0];
01047     SLOW(
01048          sh_vcc[9+6] = sh_slow_v1[0];
01049          sh_vcc[9+7] = sh_slow_v2[0];
01050          sh_vcc[9+8] = sh_slow_v3[0];
01051          )
01052   }
01053   // Write final virials and energies into global memory
01054   if (threadIdx.x < SLOW(9+)9 && threadIdx.y == 0) {
01055     float* dst = (threadIdx.x < 9) ? virials : slow_virials;
01056     const float* src = (threadIdx.x < 9) ? tmpvirials : slow_tmpvirials;
01057     int pos = patch_ind*16 + (threadIdx.x % 9);
01058     dst[pos] = sh_vcc[threadIdx.x] + src[pos];
01059   }
01060 #endif // KEPLER_SHUFFLE
01061   ENERGY(
01062          // Write final energies into global memory
01063          if (threadIdx.x < 3 && threadIdx.y == 0) {
01064            int pos = patch_ind*16 + 9 + threadIdx.x;
01065            virials[pos] = tmpvirials[pos];
01066          }
01067          );
01068   GENPAIRLIST(
01069         if (threadIdx.x == 0 && threadIdx.y == 0) {
01070                 int pos = patch_ind*16 + 12;
01071                 virials[pos] = tmpvirials[pos];                 
01072         }
01073         );
01074 } 
01075 
01076 #endif // NAMD_CUDA
01077 
