msmpot_cuda_shortrng.cu Source File

00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 2008-2009 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: msmpot_cuda_shortrng.cu,v $
00013  *      $Author: johns $        $Locker:  $             $State: Exp $
00014  *      $Revision: 1.4 $      $Date: 2014/05/27 15:31:50 $
00015  *
00016  ***************************************************************************/
00017 /*
00018  * msmcuda_shortrng.cu
00019  */
00020 
00021 #include "msmpot_cuda.h"
00022 
00023 
00024 #define BLOCK_DIM_X  8
00025 #define BLOCK_DIM_Y  2
00026 #define UNROLL_Y     4
00027 #define BLOCK_DIM_Z  8
00028 
00029 #define REGSIZE_X    BLOCK_DIM_X
00030 #define REGSIZE_Y    (BLOCK_DIM_Y * UNROLL_Y)
00031 #define REGSIZE_Z    BLOCK_DIM_Z
00032 
00033 #define REGION_SIZE  (REGSIZE_X * REGSIZE_Y * REGSIZE_Z)
00034 
00035 #define LOG_BDIM_X   3
00036 #define LOG_BDIM_Y   1
00037 #define LOG_UNRL_Y   2
00038 #define LOG_BDIM_Z   3
00039 
00040 #define LOG_REGS_X   LOG_BDIM_X
00041 #define LOG_REGS_Y   (LOG_BDIM_Y + LOG_UNRL_Y)
00042 #define LOG_REGS_Z   LOG_BDIM_Z
00043 
00044 #define LOG_REGSIZE  (LOG_REGS_X + LOG_REGS_Y + LOG_REGS_Z)
00045 
00046 #define MASK_REGS_X  (REGSIZE_X - 1)
00047 #define MASK_REGS_Y  (REGSIZE_Y - 1)
00048 #define MASK_REGS_Z  (REGSIZE_Z - 1)
00049 
00050 
00051 /* XXX what should this be? */
00052 #define MAX_BIN_DEPTH  8
00053 
00054 
00055 /*
00056  * neighbor list storage uses 64000 bytes
00057  */
00058 __constant__ static int NbrListLen;
00059 __constant__ static int3 NbrList[NBRLIST_MAXLEN];
00060 
00061 
00062 /*
00063  * The following code is adapted from kernel
00064  *   cuda_cutoff_potential_lattice10overlap()
00065  * from source file mgpot_cuda_binsmall.cu from VMD cionize plugin.
00066  *
00067  * potential lattice is decomposed into size 8^3 lattice point "regions"
00068  *
00069  * THIS IMPLEMENTATION:  one thread per lattice point
00070  * thread block size 128 gives 4 thread blocks per region
00071  * kernel is invoked for each x-y plane of regions,
00072  * where gridDim.x is 4*(x region dimension) so that blockIdx.x 
00073  * can absorb the z sub-region index in its 2 lowest order bits
00074  *
00075  * Regions are stored contiguously in memory in row-major order
00076  *
00077  * The bins cover the atom domain.  For nonperiodic dimensions,
00078  * the domain length must be just long enough to include the atoms,
00079  * and the neighborhood of bins is truncated to the domain edges.
00080  * For periodic dimensions, the domain length is preset by the caller,
00081  * and the neighborhood of bins wraps around the domain edges.
00082  *
00083  * The atom coordinates are stored in bins relative to the domain origin.
00084  * The (rx0,ry0,rz0) offset for map points takes this into account.
00085  */
00086 
00087 
00088 /*
00089  * cuda_shortrange_nonperiodic()  - all dimensions must be nonperiodic
00090  */
00091 __global__ static void cuda_shortrange_nonperiodic(
00092     float *regionBase,  /* address of map regions */
00093     int zRegionDim,     /* z region dimension (x, y given by gridDim) */
00094     float dx,           /* map spacing along x-axis */
00095     float dy,           /* map spacing along y-axis */
00096     float dz,           /* map spacing along z-axis */
00097     float rx0,          /* x offset of map points, rx0 = px0 - lx0 */
00098     float ry0,          /* y offset of map points, ry0 = py0 - ly0 */
00099     float rz0,          /* z offset of map points, rz0 = pz0 - lz0 */
00100     float4 *binBase,    /* address of bins */
00101     int xBinDim,        /* x bin dimension */
00102     int yBinDim,        /* y bin dimension */
00103     int zBinDim,        /* z bin dimension */
00104     int bindepth,       /* number of atom slots per bin */
00105     float invbx,        /* inverse of bin length in x */
00106     float invby,        /* inverse of bin length in y */
00107     float invbz,        /* inverse of bin length in z */
00108     float cutoff2,      /* square of cutoff distance */
00109     float invcut        /* inverse of cutoff distance */
00110     ) {
00111 
00112   __shared__ float4 binCache[MAX_BIN_DEPTH];
00113   __shared__ float *myRegionAddr;
00114   __shared__ int3 myBinIndex;
00115 
00116   const int xRegionIndex = blockIdx.x;
00117   const int yRegionIndex = blockIdx.y;
00118 
00119   /* thread id */
00120   const int tid = (threadIdx.z*BLOCK_DIM_Y + threadIdx.y)*BLOCK_DIM_X
00121     + threadIdx.x;
00122 
00123   /* neighbor index */
00124   int nbrid;
00125 
00126   /* constants for TAYLOR2 softening */
00127   /* XXX is it more efficient to read these values from const memory? */
00128   float gc0, gc1, gc2;
00129   gc1 = invcut * invcut;
00130   gc2 = gc1 * gc1;
00131   gc0 = 1.875f * invcut;
00132   gc1 *= -1.25f * invcut;
00133   gc2 *= 0.375f * invcut;
00134 
00135   int zRegionIndex;
00136   for (zRegionIndex=0; zRegionIndex < zRegionDim; zRegionIndex++) {
00137 
00138     /* this is the start of the sub-region indexed by tid */
00139     myRegionAddr = regionBase + ((zRegionIndex*gridDim.y
00140           + yRegionIndex)*gridDim.x + xRegionIndex)*REGION_SIZE;
00141       
00142     /* spatial coordinate of this lattice point */
00143     float x = (REGSIZE_X * xRegionIndex + threadIdx.x) * dx - rx0;
00144     float y = (REGSIZE_Y * yRegionIndex + threadIdx.y) * dy - ry0;
00145     float z = (REGSIZE_Z * zRegionIndex + threadIdx.z) * dz - rz0;
00146 
00147     /* bin number determined by center of region */
00148     myBinIndex.x = (int) floorf((REGSIZE_X * xRegionIndex
00149           + REGSIZE_X / 2) * dx * invbx);
00150     myBinIndex.y = (int) floorf((REGSIZE_Y * yRegionIndex
00151           + REGSIZE_Y / 2) * dy * invby);
00152     myBinIndex.z = (int) floorf((REGSIZE_Z * zRegionIndex
00153           + REGSIZE_Z / 2) * dz * invbz);
00154 
00155     float energy0 = 0.f;
00156 #if UNROLL_Y >= 2
00157     float energy1 = 0.f;
00158 #if UNROLL_Y >= 3
00159     float energy2 = 0.f;
00160 #if UNROLL_Y >= 4
00161     float energy3 = 0.f;
00162 #endif
00163 #endif
00164 #endif
00165 
00166     for (nbrid = 0;  nbrid < NbrListLen;  nbrid++) {
00167 
00168       int ib = myBinIndex.x + NbrList[nbrid].x;
00169       int jb = myBinIndex.y + NbrList[nbrid].y;
00170       int kb = myBinIndex.z + NbrList[nbrid].z;
00171 
00172       if (ib >= 0 && ib < xBinDim &&
00173           jb >= 0 && jb < yBinDim &&
00174           kb >= 0 && kb < zBinDim) {
00175 
00176         int n;
00177 
00178         /* thread block caches one bin */
00179         __syncthreads();
00180         if (tid < bindepth) {
00181 
00182           /* determine global memory location of atom bin */
00183           float4 *bin = binBase
00184             + (((__mul24(kb, yBinDim) + jb)*xBinDim + ib) * bindepth);
00185 
00186           binCache[tid] = bin[tid];
00187         }
00188         __syncthreads();
00189 
00190         for (n = 0;  n < bindepth;  n++) {
00191 
00192           float aq = binCache[n].w;
00193           if (0.f == aq) break;  /* no more atoms in bin */
00194 
00195           float rx = binCache[n].x - x;
00196           float rz = binCache[n].z - z;
00197           float rxrz2 = rx*rx + rz*rz;
00198 #ifdef CHECK_CYLINDER
00199           if (rxrz2 >= cutoff2) continue;
00200 #endif
00201           float ry = binCache[n].y - y;
00202           float r2 = ry*ry + rxrz2;
00203 
00204           if (r2 < cutoff2) {
00205             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00206             energy0 += aq * (rsqrtf(r2) - gr2);
00207           }
00208 #if UNROLL_Y >= 2
00209           ry -= BLOCK_DIM_Y*dy;
00210           r2 = ry*ry + rxrz2;
00211           if (r2 < cutoff2) {
00212             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00213             energy1 += aq * (rsqrtf(r2) - gr2);
00214           }
00215 #if UNROLL_Y >= 3
00216           ry -= BLOCK_DIM_Y*dy;
00217           r2 = ry*ry + rxrz2;
00218           if (r2 < cutoff2) {
00219             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00220             energy2 += aq * (rsqrtf(r2) - gr2);
00221           }
00222 #if UNROLL_Y >= 4
00223           ry -= BLOCK_DIM_Y*dy;
00224           r2 = ry*ry + rxrz2;
00225           if (r2 < cutoff2) {
00226             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00227             energy3 += aq * (rsqrtf(r2) - gr2);
00228           }
00229 #endif
00230 #endif
00231 #endif
00232         } /* end loop over atoms in bin */
00233       } /* end if bin in domain */
00234 
00235     } /* end loop over neighbor list */
00236 
00237     /* store into global memory */
00238 #define DSHIFT  (LOG_BDIM_X + LOG_BDIM_Y)
00239 #define REGSXY  (REGSIZE_X * REGSIZE_Y)
00240 #define BDIMXY  (BLOCK_DIM_X * BLOCK_DIM_Y)
00241 #define MASK    (BDIMXY - 1)
00242 
00243     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK)             ] = energy0;
00244 #if UNROLL_Y >= 2
00245     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (  BDIMXY)] = energy1;
00246 #if UNROLL_Y >= 3
00247     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (2*BDIMXY)] = energy2;
00248 #if UNROLL_Y >= 4
00249     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (3*BDIMXY)] = energy3;
00250 #endif
00251 #endif
00252 #endif
00253 
00254   } /* end loop over zRegionIndex */
00255 
00256 }
00257 /* end cuda_shortrange_nonperiodic() */
00258 
00259 
00260 /*
00261  * cuda_shortrange()  - periodic dimensions wrap bin neighborhood
00262  */
00263 __global__ static void cuda_shortrange(
00264     float *regionBase,  /* address of map regions */
00265     int zRegionDim,     /* z region dimension (x, y given by gridDim) */
00266     float dx,           /* map spacing along x-axis */
00267     float dy,           /* map spacing along y-axis */
00268     float dz,           /* map spacing along z-axis */
00269     float rx0,          /* x offset of map points, rx0 = px0 - lx0 */
00270     float ry0,          /* y offset of map points, ry0 = py0 - ly0 */
00271     float rz0,          /* z offset of map points, rz0 = pz0 - lz0 */
00272     float px,           /* domain length along x dimension */
00273     float py,           /* domain length along y dimension */
00274     float pz,           /* domain length along z dimension */
00275     int isperiodic,     /* bit flags for periodicity in each dimension */
00276     float4 *binBase,    /* address of bins */
00277     int xBinDim,        /* x bin dimension */
00278     int yBinDim,        /* y bin dimension */
00279     int zBinDim,        /* z bin dimension */
00280     int bindepth,       /* number of atom slots per bin */
00281     float invbx,        /* inverse of bin length in x */
00282     float invby,        /* inverse of bin length in y */
00283     float invbz,        /* inverse of bin length in z */
00284     float cutoff2,      /* square of cutoff distance */
00285     float invcut        /* inverse of cutoff distance */
00286     ) {
00287 
00288   __shared__ float4 binCache[MAX_BIN_DEPTH];
00289   __shared__ float *myRegionAddr;
00290   __shared__ int3 myBinIndex;
00291 
00292   const int xRegionIndex = blockIdx.x;
00293   const int yRegionIndex = blockIdx.y;
00294 
00295   /* thread id */
00296   const int tid = (threadIdx.z*BLOCK_DIM_Y + threadIdx.y)*BLOCK_DIM_X
00297     + threadIdx.x;
00298 
00299   /* neighbor index */
00300   int nbrid;
00301 
00302   /* constants for TAYLOR2 softening */
00303   /* XXX is it more efficient to read these values from const memory? */
00304   float gc0, gc1, gc2;
00305   gc1 = invcut * invcut;
00306   gc2 = gc1 * gc1;
00307   gc0 = 1.875f * invcut;
00308   gc1 *= -1.25f * invcut;
00309   gc2 *= 0.375f * invcut;
00310 
00311   int zRegionIndex;
00312   for (zRegionIndex=0; zRegionIndex < zRegionDim; zRegionIndex++) {
00313 
00314     /* this is the start of the sub-region indexed by tid */
00315     myRegionAddr = regionBase + ((zRegionIndex*gridDim.y
00316           + yRegionIndex)*gridDim.x + xRegionIndex)*REGION_SIZE;
00317       
00318     /* spatial coordinate of this lattice point */
00319     float x = (REGSIZE_X * xRegionIndex + threadIdx.x) * dx - rx0;
00320     float y = (REGSIZE_Y * yRegionIndex + threadIdx.y) * dy - ry0;
00321     float z = (REGSIZE_Z * zRegionIndex + threadIdx.z) * dz - rz0;
00322 
00323     /* bin number determined by center of region */
00324     myBinIndex.x = (int) floorf((REGSIZE_X * xRegionIndex
00325           + REGSIZE_X / 2) * dx * invbx);
00326     myBinIndex.y = (int) floorf((REGSIZE_Y * yRegionIndex
00327           + REGSIZE_Y / 2) * dy * invby);
00328     myBinIndex.z = (int) floorf((REGSIZE_Z * zRegionIndex
00329           + REGSIZE_Z / 2) * dz * invbz);
00330 
00331     float energy0 = 0.f;
00332 #if UNROLL_Y >= 2
00333     float energy1 = 0.f;
00334 #if UNROLL_Y >= 3
00335     float energy2 = 0.f;
00336 #if UNROLL_Y >= 4
00337     float energy3 = 0.f;
00338 #endif
00339 #endif
00340 #endif
00341 
00342     for (nbrid = 0;  nbrid < NbrListLen;  nbrid++) {
00343 
00344       int ib = myBinIndex.x + NbrList[nbrid].x;
00345       int jb = myBinIndex.y + NbrList[nbrid].y;
00346       int kb = myBinIndex.z + NbrList[nbrid].z;
00347 
00348       float xw = 0;  /* periodic offset for wrapping coordinates */
00349       float yw = 0;
00350       float zw = 0;
00351 
00352       if (IS_SET_X(isperiodic)) {
00353 #if 1
00354         if (ib < 0) {
00355           ib += xBinDim;
00356           xw -= px;
00357         }
00358         else if (ib >= xBinDim) {
00359           ib -= xBinDim;
00360           xw += px;
00361         }
00362 #else
00363         while (ib < 0)        { ib += xBinDim;  xw -= px; }
00364         while (ib >= xBinDim) { ib -= xBinDim;  xw += px; }
00365 #endif
00366       }
00367       else if (ib < 0 || ib >= xBinDim) continue;
00368 
00369       if (IS_SET_Y(isperiodic)) {
00370 #if 1
00371         if (jb < 0) {
00372           jb += yBinDim;
00373           yw -= py;
00374         }
00375         else if (jb >= yBinDim) {
00376           jb -= yBinDim;
00377           yw += py;
00378         }
00379 #else
00380         while (jb < 0)        { jb += yBinDim;  yw -= py; }
00381         while (jb >= yBinDim) { jb -= yBinDim;  yw += py; }
00382 #endif
00383       }
00384       else if (jb < 0 || jb >= yBinDim) continue;
00385 
00386       if (IS_SET_Z(isperiodic)) {
00387 #if 1
00388         if (kb < 0) {
00389           kb += zBinDim;
00390           zw -= pz;
00391         }
00392         else if (kb >= zBinDim) {
00393           kb -= zBinDim;
00394           zw += pz;
00395         }
00396 #else
00397         while (kb < 0)        { kb += zBinDim;  zw -= pz; }
00398         while (kb >= zBinDim) { kb -= zBinDim;  zw += pz; }
00399 #endif
00400       }
00401       else if (kb < 0 || kb >= zBinDim) continue;
00402 
00403       {
00404         int n;
00405 
00406         /* thread block caches one bin */
00407         __syncthreads();
00408         if (tid < bindepth) {
00409 
00410           /* determine global memory location of atom bin */
00411           float4 *bin = binBase
00412             + (((__mul24(kb, yBinDim) + jb)*xBinDim + ib) * bindepth);
00413 
00414           binCache[tid] = bin[tid];
00415         }
00416         __syncthreads();
00417 
00418         for (n = 0;  n < bindepth;  n++) {
00419 
00420           float aq = binCache[n].w;
00421           if (0.f == aq) break;  /* no more atoms in bin */
00422 
00423           float rx = (binCache[n].x+xw) - x;
00424           float rz = (binCache[n].z+zw) - z;
00425           float rxrz2 = rx*rx + rz*rz;
00426 #ifdef CHECK_CYLINDER
00427           if (rxrz2 >= cutoff2) continue;
00428 #endif
00429           float ry = (binCache[n].y+yw) - y;
00430           float r2 = ry*ry + rxrz2;
00431 
00432           if (r2 < cutoff2) {
00433             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00434             energy0 += aq * (rsqrtf(r2) - gr2);
00435           }
00436 #if UNROLL_Y >= 2
00437           ry -= BLOCK_DIM_Y*dy;
00438           r2 = ry*ry + rxrz2;
00439           if (r2 < cutoff2) {
00440             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00441             energy1 += aq * (rsqrtf(r2) - gr2);
00442           }
00443 #if UNROLL_Y >= 3
00444           ry -= BLOCK_DIM_Y*dy;
00445           r2 = ry*ry + rxrz2;
00446           if (r2 < cutoff2) {
00447             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00448             energy2 += aq * (rsqrtf(r2) - gr2);
00449           }
00450 #if UNROLL_Y >= 4
00451           ry -= BLOCK_DIM_Y*dy;
00452           r2 = ry*ry + rxrz2;
00453           if (r2 < cutoff2) {
00454             float gr2 = gc0 + r2*(gc1 + r2*gc2);
00455             energy3 += aq * (rsqrtf(r2) - gr2);
00456           }
00457 #endif
00458 #endif
00459 #endif
00460         } /* end loop over atoms in bin */
00461       } /* end if bin in domain */
00462 
00463     } /* end loop over neighbor list */
00464 
00465     /* store into global memory */
00466 #define DSHIFT  (LOG_BDIM_X + LOG_BDIM_Y)
00467 #define REGSXY  (REGSIZE_X * REGSIZE_Y)
00468 #define BDIMXY  (BLOCK_DIM_X * BLOCK_DIM_Y)
00469 #define MASK    (BDIMXY - 1)
00470 
00471     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK)             ] = energy0;
00472 #if UNROLL_Y >= 2
00473     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (  BDIMXY)] = energy1;
00474 #if UNROLL_Y >= 3
00475     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (2*BDIMXY)] = energy2;
00476 #if UNROLL_Y >= 4
00477     myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (3*BDIMXY)] = energy3;
00478 #endif
00479 #endif
00480 #endif
00481 
00482   } /* end loop over zRegionIndex */
00483 
00484 }
00485 /* end cuda_shortrange() */
00486 
00487 
00488 
00489 /*
00490  * call when finished
00491  */
00492 void Msmpot_cuda_cleanup_shortrng(MsmpotCuda *mc) {
00493   cudaFree(mc->dev_bin);
00494   cudaFree(mc->dev_padmap);
00495   free(mc->padmap);
00496 }
00497 
00498 
00499 /*
00500  * call once or whenever parameters are changed
00501  */
00502 int Msmpot_cuda_setup_shortrng(MsmpotCuda *mc) {
00503   Msmpot *msm = mc->msmpot;
00504   const int mx = msm->mx;
00505   const int my = msm->my;
00506   const int mz = msm->mz;
00507   int rmx, rmy, rmz;
00508   int pmx, pmy, pmz;
00509   long pmall;
00510   const int nbins = (msm->nbx * msm->nby * msm->nbz);
00511   int rc;
00512 
00513   /* count "regions" of map points in each dimension, rounding up */
00514   rmx = (mx >> LOG_REGS_X) + ((mx & MASK_REGS_X) ? 1 : 0);
00515   rmy = (my >> LOG_REGS_Y) + ((my & MASK_REGS_Y) ? 1 : 0);
00516   rmz = (mz >> LOG_REGS_Z) + ((mz & MASK_REGS_Z) ? 1 : 0);
00517 
00518   /* padded epotmap dimensions */
00519   pmx = (rmx << LOG_REGS_X);
00520   pmy = (rmy << LOG_REGS_Y);
00521   pmz = (rmz << LOG_REGS_Z);
00522 
00523   /* allocate space for padded epotmap */
00524   pmall = (pmx * pmy) * (long)pmz;
00525   if (mc->maxpm < pmall) {
00526     void *v = realloc(mc->padmap, pmall * sizeof(float));
00527     if (NULL == v) return ERROR(MSMPOT_ERROR_MALLOC);
00528     mc->padmap = (float *) v;
00529     mc->maxpm = pmall;
00530   }
00531   mc->pmx = pmx;
00532   mc->pmy = pmy;
00533   mc->pmz = pmz;
00534 
00535   REPORT("Determine bin neighborhood for CUDA short-range part.");
00536   rc = Msmpot_compute_shortrng_bin_neighborhood(msm,
00537       REGSIZE_X * msm->dx, REGSIZE_Y * msm->dy, REGSIZE_Z * msm->dz);
00538   if (rc != MSMPOT_SUCCESS) return ERROR(rc);
00539 
00540   /*
00541    * protect CUDA periodic case against having to wrap multiple times
00542    * around the domain for an atom bin
00543    */
00544   if (msm->isperiodic) {
00545     int n;
00546     int ibmax = 0, jbmax = 0, kbmax = 0;
00547     for (n = 0;  n < msm->nboff;  n++) {
00548       int ib = msm->boff[3*n  ];
00549       int jb = msm->boff[3*n+1];
00550       int kb = msm->boff[3*n+2];
00551       if (ib < 0)      ib = -ib;
00552       if (ibmax < ib)  ibmax = ib;
00553       if (jb < 0)      jb = -jb;
00554       if (jbmax < jb)  jbmax = jb;
00555       if (kb < 0)      kb = -kb;
00556       if (kbmax < kb)  kbmax = kb;
00557     }
00558     if (ibmax >= msm->nbx || jbmax >= msm->nby || kbmax >= msm->nbz) {
00559       REPORT("Bin neighborhood is too big for wrapping with CUDA.");
00560       return ERROR(MSMPOT_ERROR_CUDA_SUPPORT);
00561     }
00562   }
00563 
00564   /*
00565    * allocate CUDA device memory
00566    * (for now make CUDA arrays same length as host arrays)
00567    */
00568   if (mc->dev_maxpm < mc->maxpm) {
00569     void *v = NULL;
00570     cudaFree(mc->dev_padmap);
00571     CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00572     cudaMalloc(&v, mc->maxpm * sizeof(float));
00573     CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00574     mc->dev_padmap = (float *) v;
00575     mc->dev_maxpm = mc->maxpm;
00576   }
00577 
00578   if (mc->dev_nbins < nbins) {
00579     void *v = NULL;
00580     cudaFree(mc->dev_bin);
00581     CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00582     cudaMalloc(&v, nbins * msm->bindepth * sizeof(float4));
00583     CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00584     mc->dev_bin = (float4 *) v;
00585     mc->dev_nbins = nbins;
00586   }
00587 
00588   /*
00589    * copy region neighborhood atom bin index offsets
00590    * to device constant memory
00591    */
00592   cudaMemcpyToSymbol(NbrListLen, &(msm->nboff), sizeof(int), 0);
00593   CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00594   cudaMemcpyToSymbol(NbrList, msm->boff, msm->nboff * sizeof(int3), 0);
00595   CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00596 
00597   return MSMPOT_SUCCESS;
00598 }
00599 
00600 
00601 int Msmpot_cuda_compute_shortrng(MsmpotCuda *mc) {
00602   Msmpot *msm = mc->msmpot;
00603   float *epotmap = msm->epotmap;
00604   float *padmap = mc->padmap;
00605   const int nxbins = msm->nbx;
00606   const int nybins = msm->nby;
00607   const int nzbins = msm->nbz;
00608   const int nbins = nxbins * nybins * nzbins;
00609   const int mx = msm->mx;
00610   const int my = msm->my;
00611   const int mz = msm->mz;
00612   const int mxRegions = (mc->pmx >> LOG_REGS_X);
00613   const int myRegions = (mc->pmy >> LOG_REGS_Y);
00614   const int mzRegions = (mc->pmz >> LOG_REGS_Z);
00615   const long pmall = (mc->pmx * mc->pmy) * (long) mc->pmz;
00616   const float cutoff2 = msm->a * msm->a;
00617   const float invcut = 1.f / msm->a;
00618   const float rx0 = msm->px0 - msm->lx0;  /* translation for grid points */
00619   const float ry0 = msm->py0 - msm->ly0;
00620   const float rz0 = msm->pz0 - msm->lz0;
00621   int i, j, k;
00622   int mxRegionIndex, myRegionIndex, mzRegionIndex;
00623   int mxOffset, myOffset, mzOffset;
00624   long indexRegion, index;
00625   float *thisRegion;
00626   dim3 gridDim, blockDim;
00627   cudaStream_t shortrng_stream;
00628   int rc = MSMPOT_SUCCESS;
00629 
00630   REPORT("Perform atom hashing for CUDA short-range part.");
00631   rc = Msmpot_compute_shortrng_bin_hashing(msm);
00632   if (rc != MSMPOT_SUCCESS) return ERROR(rc);
00633 
00634 #ifdef MSMPOT_REPORT
00635   if (msm->isperiodic) {
00636     REPORT("Computing periodic short-range part with CUDA.");
00637   }
00638   else {
00639     REPORT("Computing short-range part with CUDA.");
00640   }
00641 #endif
00642 
00643   /* copy atom bins to device */
00644   cudaMemcpy(mc->dev_bin, msm->bin, nbins * msm->bindepth * sizeof(float4),
00645       cudaMemcpyHostToDevice);
00646   CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00647 
00648   gridDim.x = mxRegions;
00649   gridDim.y = myRegions;
00650   gridDim.z = 1;
00651   blockDim.x = BLOCK_DIM_X;
00652   blockDim.y = BLOCK_DIM_Y;
00653   blockDim.z = BLOCK_DIM_Z;
00654 
00655   cudaStreamCreate(&shortrng_stream);  /* asynchronously invoke CUDA kernel */
00656   if (msm->isperiodic) {
00657     cuda_shortrange<<<gridDim, blockDim, 0>>>(
00658         mc->dev_padmap, mzRegions,
00659         msm->dx, msm->dy, msm->dz,
00660         rx0, ry0, rz0,
00661         msm->px, msm->py, msm->pz, msm->isperiodic,
00662         mc->dev_bin, nxbins, nybins, nzbins, msm->bindepth,
00663         msm->invbx, msm->invby, msm->invbz,
00664         cutoff2, invcut);
00665   }
00666   else {
00667     cuda_shortrange_nonperiodic<<<gridDim, blockDim, 0>>>(
00668         mc->dev_padmap, mzRegions,
00669         msm->dx, msm->dy, msm->dz,
00670         rx0, ry0, rz0,
00671         mc->dev_bin, nxbins, nybins, nzbins, msm->bindepth,
00672         msm->invbx, msm->invby, msm->invbz,
00673         cutoff2, invcut);
00674   }
00675   if (msm->nover > 0) {  /* call CPU to concurrently compute extra atoms */
00676 #ifdef MSMPOT_REPORT
00677     char msg[120];
00678     sprintf(msg, "Extra atoms (%d) from overflowed bins "
00679         "must also be evaluated.", msm->nover);
00680     REPORT(msg);
00681 #endif
00682     rc = Msmpot_compute_shortrng_linklist(mc->msmpot, msm->over, msm->nover);
00683     if (rc) return ERROR(rc);
00684   }
00685   cudaStreamSynchronize(shortrng_stream);
00686   CUERR(MSMPOT_ERROR_CUDA_KERNEL);
00687   cudaDeviceSynchronize();
00688   cudaStreamDestroy(shortrng_stream);
00689 
00690   /* copy result regions from CUDA device */
00691   cudaMemcpy(padmap, mc->dev_padmap, pmall * sizeof(float),
00692       cudaMemcpyDeviceToHost);
00693   CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00694 
00695   /* transpose regions from padEpotmap and add into result epotmap */
00696   for (k = 0;  k < mz;  k++) {
00697     mzRegionIndex = (k >> LOG_REGS_Z);
00698     mzOffset = (k & MASK_REGS_Z);
00699 
00700     for (j = 0;  j < my;  j++) {
00701       myRegionIndex = (j >> LOG_REGS_Y);
00702       myOffset = (j & MASK_REGS_Y);
00703 
00704       for (i = 0;  i < mx;  i++) {
00705         mxRegionIndex = (i >> LOG_REGS_X);
00706         mxOffset = (i & MASK_REGS_X);
00707 
00708         thisRegion = padmap
00709           + ((mzRegionIndex * myRegions + myRegionIndex) * (long) mxRegions
00710               + mxRegionIndex) * REGION_SIZE;
00711 
00712         indexRegion = (mzOffset * REGSIZE_Y + myOffset) * REGSIZE_X + mxOffset;
00713         index = (k * my + j) * mx + i;
00714 
00715         epotmap[index] += thisRegion[indexRegion];
00716       }
00717     }
00718   }
00719 
00720   return MSMPOT_SUCCESS;
00721 }