00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "msmpot_cuda.h"
00022
00023
00024 #define BLOCK_DIM_X 8
00025 #define BLOCK_DIM_Y 2
00026 #define UNROLL_Y 4
00027 #define BLOCK_DIM_Z 8
00028
00029 #define REGSIZE_X BLOCK_DIM_X
00030 #define REGSIZE_Y (BLOCK_DIM_Y * UNROLL_Y)
00031 #define REGSIZE_Z BLOCK_DIM_Z
00032
00033 #define REGION_SIZE (REGSIZE_X * REGSIZE_Y * REGSIZE_Z)
00034
00035 #define LOG_BDIM_X 3
00036 #define LOG_BDIM_Y 1
00037 #define LOG_UNRL_Y 2
00038 #define LOG_BDIM_Z 3
00039
00040 #define LOG_REGS_X LOG_BDIM_X
00041 #define LOG_REGS_Y (LOG_BDIM_Y + LOG_UNRL_Y)
00042 #define LOG_REGS_Z LOG_BDIM_Z
00043
00044 #define LOG_REGSIZE (LOG_REGS_X + LOG_REGS_Y + LOG_REGS_Z)
00045
00046 #define MASK_REGS_X (REGSIZE_X - 1)
00047 #define MASK_REGS_Y (REGSIZE_Y - 1)
00048 #define MASK_REGS_Z (REGSIZE_Z - 1)
00049
00050
00051
00052 #define MAX_BIN_DEPTH 8
00053
00054
00055
00056
00057
00058 __constant__ static int NbrListLen;
00059 __constant__ static int3 NbrList[NBRLIST_MAXLEN];
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091 __global__ static void cuda_shortrange_nonperiodic(
00092 float *regionBase,
00093 int zRegionDim,
00094 float dx,
00095 float dy,
00096 float dz,
00097 float rx0,
00098 float ry0,
00099 float rz0,
00100 float4 *binBase,
00101 int xBinDim,
00102 int yBinDim,
00103 int zBinDim,
00104 int bindepth,
00105 float invbx,
00106 float invby,
00107 float invbz,
00108 float cutoff2,
00109 float invcut
00110 ) {
00111
00112 __shared__ float4 binCache[MAX_BIN_DEPTH];
00113 __shared__ float *myRegionAddr;
00114 __shared__ int3 myBinIndex;
00115
00116 const int xRegionIndex = blockIdx.x;
00117 const int yRegionIndex = blockIdx.y;
00118
00119
00120 const int tid = (threadIdx.z*BLOCK_DIM_Y + threadIdx.y)*BLOCK_DIM_X
00121 + threadIdx.x;
00122
00123
00124 int nbrid;
00125
00126
00127
00128 float gc0, gc1, gc2;
00129 gc1 = invcut * invcut;
00130 gc2 = gc1 * gc1;
00131 gc0 = 1.875f * invcut;
00132 gc1 *= -1.25f * invcut;
00133 gc2 *= 0.375f * invcut;
00134
00135 int zRegionIndex;
00136 for (zRegionIndex=0; zRegionIndex < zRegionDim; zRegionIndex++) {
00137
00138
00139 myRegionAddr = regionBase + ((zRegionIndex*gridDim.y
00140 + yRegionIndex)*gridDim.x + xRegionIndex)*REGION_SIZE;
00141
00142
00143 float x = (REGSIZE_X * xRegionIndex + threadIdx.x) * dx - rx0;
00144 float y = (REGSIZE_Y * yRegionIndex + threadIdx.y) * dy - ry0;
00145 float z = (REGSIZE_Z * zRegionIndex + threadIdx.z) * dz - rz0;
00146
00147
00148 myBinIndex.x = (int) floorf((REGSIZE_X * xRegionIndex
00149 + REGSIZE_X / 2) * dx * invbx);
00150 myBinIndex.y = (int) floorf((REGSIZE_Y * yRegionIndex
00151 + REGSIZE_Y / 2) * dy * invby);
00152 myBinIndex.z = (int) floorf((REGSIZE_Z * zRegionIndex
00153 + REGSIZE_Z / 2) * dz * invbz);
00154
00155 float energy0 = 0.f;
00156 #if UNROLL_Y >= 2
00157 float energy1 = 0.f;
00158 #if UNROLL_Y >= 3
00159 float energy2 = 0.f;
00160 #if UNROLL_Y >= 4
00161 float energy3 = 0.f;
00162 #endif
00163 #endif
00164 #endif
00165
00166 for (nbrid = 0; nbrid < NbrListLen; nbrid++) {
00167
00168 int ib = myBinIndex.x + NbrList[nbrid].x;
00169 int jb = myBinIndex.y + NbrList[nbrid].y;
00170 int kb = myBinIndex.z + NbrList[nbrid].z;
00171
00172 if (ib >= 0 && ib < xBinDim &&
00173 jb >= 0 && jb < yBinDim &&
00174 kb >= 0 && kb < zBinDim) {
00175
00176 int n;
00177
00178
00179 __syncthreads();
00180 if (tid < bindepth) {
00181
00182
00183 float4 *bin = binBase
00184 + (((__mul24(kb, yBinDim) + jb)*xBinDim + ib) * bindepth);
00185
00186 binCache[tid] = bin[tid];
00187 }
00188 __syncthreads();
00189
00190 for (n = 0; n < bindepth; n++) {
00191
00192 float aq = binCache[n].w;
00193 if (0.f == aq) break;
00194
00195 float rx = binCache[n].x - x;
00196 float rz = binCache[n].z - z;
00197 float rxrz2 = rx*rx + rz*rz;
00198 #ifdef CHECK_CYLINDER
00199 if (rxrz2 >= cutoff2) continue;
00200 #endif
00201 float ry = binCache[n].y - y;
00202 float r2 = ry*ry + rxrz2;
00203
00204 if (r2 < cutoff2) {
00205 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00206 energy0 += aq * (rsqrtf(r2) - gr2);
00207 }
00208 #if UNROLL_Y >= 2
00209 ry -= BLOCK_DIM_Y*dy;
00210 r2 = ry*ry + rxrz2;
00211 if (r2 < cutoff2) {
00212 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00213 energy1 += aq * (rsqrtf(r2) - gr2);
00214 }
00215 #if UNROLL_Y >= 3
00216 ry -= BLOCK_DIM_Y*dy;
00217 r2 = ry*ry + rxrz2;
00218 if (r2 < cutoff2) {
00219 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00220 energy2 += aq * (rsqrtf(r2) - gr2);
00221 }
00222 #if UNROLL_Y >= 4
00223 ry -= BLOCK_DIM_Y*dy;
00224 r2 = ry*ry + rxrz2;
00225 if (r2 < cutoff2) {
00226 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00227 energy3 += aq * (rsqrtf(r2) - gr2);
00228 }
00229 #endif
00230 #endif
00231 #endif
00232 }
00233 }
00234
00235 }
00236
00237
00238 #define DSHIFT (LOG_BDIM_X + LOG_BDIM_Y)
00239 #define REGSXY (REGSIZE_X * REGSIZE_Y)
00240 #define BDIMXY (BLOCK_DIM_X * BLOCK_DIM_Y)
00241 #define MASK (BDIMXY - 1)
00242
00243 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) ] = energy0;
00244 #if UNROLL_Y >= 2
00245 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + ( BDIMXY)] = energy1;
00246 #if UNROLL_Y >= 3
00247 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (2*BDIMXY)] = energy2;
00248 #if UNROLL_Y >= 4
00249 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (3*BDIMXY)] = energy3;
00250 #endif
00251 #endif
00252 #endif
00253
00254 }
00255
00256 }
00257
00258
00259
00260
00261
00262
00263 __global__ static void cuda_shortrange(
00264 float *regionBase,
00265 int zRegionDim,
00266 float dx,
00267 float dy,
00268 float dz,
00269 float rx0,
00270 float ry0,
00271 float rz0,
00272 float px,
00273 float py,
00274 float pz,
00275 int isperiodic,
00276 float4 *binBase,
00277 int xBinDim,
00278 int yBinDim,
00279 int zBinDim,
00280 int bindepth,
00281 float invbx,
00282 float invby,
00283 float invbz,
00284 float cutoff2,
00285 float invcut
00286 ) {
00287
00288 __shared__ float4 binCache[MAX_BIN_DEPTH];
00289 __shared__ float *myRegionAddr;
00290 __shared__ int3 myBinIndex;
00291
00292 const int xRegionIndex = blockIdx.x;
00293 const int yRegionIndex = blockIdx.y;
00294
00295
00296 const int tid = (threadIdx.z*BLOCK_DIM_Y + threadIdx.y)*BLOCK_DIM_X
00297 + threadIdx.x;
00298
00299
00300 int nbrid;
00301
00302
00303
00304 float gc0, gc1, gc2;
00305 gc1 = invcut * invcut;
00306 gc2 = gc1 * gc1;
00307 gc0 = 1.875f * invcut;
00308 gc1 *= -1.25f * invcut;
00309 gc2 *= 0.375f * invcut;
00310
00311 int zRegionIndex;
00312 for (zRegionIndex=0; zRegionIndex < zRegionDim; zRegionIndex++) {
00313
00314
00315 myRegionAddr = regionBase + ((zRegionIndex*gridDim.y
00316 + yRegionIndex)*gridDim.x + xRegionIndex)*REGION_SIZE;
00317
00318
00319 float x = (REGSIZE_X * xRegionIndex + threadIdx.x) * dx - rx0;
00320 float y = (REGSIZE_Y * yRegionIndex + threadIdx.y) * dy - ry0;
00321 float z = (REGSIZE_Z * zRegionIndex + threadIdx.z) * dz - rz0;
00322
00323
00324 myBinIndex.x = (int) floorf((REGSIZE_X * xRegionIndex
00325 + REGSIZE_X / 2) * dx * invbx);
00326 myBinIndex.y = (int) floorf((REGSIZE_Y * yRegionIndex
00327 + REGSIZE_Y / 2) * dy * invby);
00328 myBinIndex.z = (int) floorf((REGSIZE_Z * zRegionIndex
00329 + REGSIZE_Z / 2) * dz * invbz);
00330
00331 float energy0 = 0.f;
00332 #if UNROLL_Y >= 2
00333 float energy1 = 0.f;
00334 #if UNROLL_Y >= 3
00335 float energy2 = 0.f;
00336 #if UNROLL_Y >= 4
00337 float energy3 = 0.f;
00338 #endif
00339 #endif
00340 #endif
00341
00342 for (nbrid = 0; nbrid < NbrListLen; nbrid++) {
00343
00344 int ib = myBinIndex.x + NbrList[nbrid].x;
00345 int jb = myBinIndex.y + NbrList[nbrid].y;
00346 int kb = myBinIndex.z + NbrList[nbrid].z;
00347
00348 float xw = 0;
00349 float yw = 0;
00350 float zw = 0;
00351
00352 if (IS_SET_X(isperiodic)) {
00353 #if 1
00354 if (ib < 0) {
00355 ib += xBinDim;
00356 xw -= px;
00357 }
00358 else if (ib >= xBinDim) {
00359 ib -= xBinDim;
00360 xw += px;
00361 }
00362 #else
00363 while (ib < 0) { ib += xBinDim; xw -= px; }
00364 while (ib >= xBinDim) { ib -= xBinDim; xw += px; }
00365 #endif
00366 }
00367 else if (ib < 0 || ib >= xBinDim) continue;
00368
00369 if (IS_SET_Y(isperiodic)) {
00370 #if 1
00371 if (jb < 0) {
00372 jb += yBinDim;
00373 yw -= py;
00374 }
00375 else if (jb >= yBinDim) {
00376 jb -= yBinDim;
00377 yw += py;
00378 }
00379 #else
00380 while (jb < 0) { jb += yBinDim; yw -= py; }
00381 while (jb >= yBinDim) { jb -= yBinDim; yw += py; }
00382 #endif
00383 }
00384 else if (jb < 0 || jb >= yBinDim) continue;
00385
00386 if (IS_SET_Z(isperiodic)) {
00387 #if 1
00388 if (kb < 0) {
00389 kb += zBinDim;
00390 zw -= pz;
00391 }
00392 else if (kb >= zBinDim) {
00393 kb -= zBinDim;
00394 zw += pz;
00395 }
00396 #else
00397 while (kb < 0) { kb += zBinDim; zw -= pz; }
00398 while (kb >= zBinDim) { kb -= zBinDim; zw += pz; }
00399 #endif
00400 }
00401 else if (kb < 0 || kb >= zBinDim) continue;
00402
00403 {
00404 int n;
00405
00406
00407 __syncthreads();
00408 if (tid < bindepth) {
00409
00410
00411 float4 *bin = binBase
00412 + (((__mul24(kb, yBinDim) + jb)*xBinDim + ib) * bindepth);
00413
00414 binCache[tid] = bin[tid];
00415 }
00416 __syncthreads();
00417
00418 for (n = 0; n < bindepth; n++) {
00419
00420 float aq = binCache[n].w;
00421 if (0.f == aq) break;
00422
00423 float rx = (binCache[n].x+xw) - x;
00424 float rz = (binCache[n].z+zw) - z;
00425 float rxrz2 = rx*rx + rz*rz;
00426 #ifdef CHECK_CYLINDER
00427 if (rxrz2 >= cutoff2) continue;
00428 #endif
00429 float ry = (binCache[n].y+yw) - y;
00430 float r2 = ry*ry + rxrz2;
00431
00432 if (r2 < cutoff2) {
00433 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00434 energy0 += aq * (rsqrtf(r2) - gr2);
00435 }
00436 #if UNROLL_Y >= 2
00437 ry -= BLOCK_DIM_Y*dy;
00438 r2 = ry*ry + rxrz2;
00439 if (r2 < cutoff2) {
00440 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00441 energy1 += aq * (rsqrtf(r2) - gr2);
00442 }
00443 #if UNROLL_Y >= 3
00444 ry -= BLOCK_DIM_Y*dy;
00445 r2 = ry*ry + rxrz2;
00446 if (r2 < cutoff2) {
00447 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00448 energy2 += aq * (rsqrtf(r2) - gr2);
00449 }
00450 #if UNROLL_Y >= 4
00451 ry -= BLOCK_DIM_Y*dy;
00452 r2 = ry*ry + rxrz2;
00453 if (r2 < cutoff2) {
00454 float gr2 = gc0 + r2*(gc1 + r2*gc2);
00455 energy3 += aq * (rsqrtf(r2) - gr2);
00456 }
00457 #endif
00458 #endif
00459 #endif
00460 }
00461 }
00462
00463 }
00464
00465
00466 #define DSHIFT (LOG_BDIM_X + LOG_BDIM_Y)
00467 #define REGSXY (REGSIZE_X * REGSIZE_Y)
00468 #define BDIMXY (BLOCK_DIM_X * BLOCK_DIM_Y)
00469 #define MASK (BDIMXY - 1)
00470
00471 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) ] = energy0;
00472 #if UNROLL_Y >= 2
00473 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + ( BDIMXY)] = energy1;
00474 #if UNROLL_Y >= 3
00475 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (2*BDIMXY)] = energy2;
00476 #if UNROLL_Y >= 4
00477 myRegionAddr[(tid>>DSHIFT)*REGSXY + (tid&MASK) + (3*BDIMXY)] = energy3;
00478 #endif
00479 #endif
00480 #endif
00481
00482 }
00483
00484 }
00485
00486
00487
00488
00489
00490
00491
00492 void Msmpot_cuda_cleanup_shortrng(MsmpotCuda *mc) {
00493 cudaFree(mc->dev_bin);
00494 cudaFree(mc->dev_padmap);
00495 free(mc->padmap);
00496 }
00497
00498
00499
00500
00501
00502 int Msmpot_cuda_setup_shortrng(MsmpotCuda *mc) {
00503 Msmpot *msm = mc->msmpot;
00504 const int mx = msm->mx;
00505 const int my = msm->my;
00506 const int mz = msm->mz;
00507 int rmx, rmy, rmz;
00508 int pmx, pmy, pmz;
00509 long pmall;
00510 const int nbins = (msm->nbx * msm->nby * msm->nbz);
00511 int rc;
00512
00513
00514 rmx = (mx >> LOG_REGS_X) + ((mx & MASK_REGS_X) ? 1 : 0);
00515 rmy = (my >> LOG_REGS_Y) + ((my & MASK_REGS_Y) ? 1 : 0);
00516 rmz = (mz >> LOG_REGS_Z) + ((mz & MASK_REGS_Z) ? 1 : 0);
00517
00518
00519 pmx = (rmx << LOG_REGS_X);
00520 pmy = (rmy << LOG_REGS_Y);
00521 pmz = (rmz << LOG_REGS_Z);
00522
00523
00524 pmall = (pmx * pmy) * (long)pmz;
00525 if (mc->maxpm < pmall) {
00526 void *v = realloc(mc->padmap, pmall * sizeof(float));
00527 if (NULL == v) return ERROR(MSMPOT_ERROR_MALLOC);
00528 mc->padmap = (float *) v;
00529 mc->maxpm = pmall;
00530 }
00531 mc->pmx = pmx;
00532 mc->pmy = pmy;
00533 mc->pmz = pmz;
00534
00535 REPORT("Determine bin neighborhood for CUDA short-range part.");
00536 rc = Msmpot_compute_shortrng_bin_neighborhood(msm,
00537 REGSIZE_X * msm->dx, REGSIZE_Y * msm->dy, REGSIZE_Z * msm->dz);
00538 if (rc != MSMPOT_SUCCESS) return ERROR(rc);
00539
00540
00541
00542
00543
00544 if (msm->isperiodic) {
00545 int n;
00546 int ibmax = 0, jbmax = 0, kbmax = 0;
00547 for (n = 0; n < msm->nboff; n++) {
00548 int ib = msm->boff[3*n ];
00549 int jb = msm->boff[3*n+1];
00550 int kb = msm->boff[3*n+2];
00551 if (ib < 0) ib = -ib;
00552 if (ibmax < ib) ibmax = ib;
00553 if (jb < 0) jb = -jb;
00554 if (jbmax < jb) jbmax = jb;
00555 if (kb < 0) kb = -kb;
00556 if (kbmax < kb) kbmax = kb;
00557 }
00558 if (ibmax >= msm->nbx || jbmax >= msm->nby || kbmax >= msm->nbz) {
00559 REPORT("Bin neighborhood is too big for wrapping with CUDA.");
00560 return ERROR(MSMPOT_ERROR_CUDA_SUPPORT);
00561 }
00562 }
00563
00564
00565
00566
00567
00568 if (mc->dev_maxpm < mc->maxpm) {
00569 void *v = NULL;
00570 cudaFree(mc->dev_padmap);
00571 CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00572 cudaMalloc(&v, mc->maxpm * sizeof(float));
00573 CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00574 mc->dev_padmap = (float *) v;
00575 mc->dev_maxpm = mc->maxpm;
00576 }
00577
00578 if (mc->dev_nbins < nbins) {
00579 void *v = NULL;
00580 cudaFree(mc->dev_bin);
00581 CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00582 cudaMalloc(&v, nbins * msm->bindepth * sizeof(float4));
00583 CUERR(MSMPOT_ERROR_CUDA_MALLOC);
00584 mc->dev_bin = (float4 *) v;
00585 mc->dev_nbins = nbins;
00586 }
00587
00588
00589
00590
00591
00592 cudaMemcpyToSymbol(NbrListLen, &(msm->nboff), sizeof(int), 0);
00593 CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00594 cudaMemcpyToSymbol(NbrList, msm->boff, msm->nboff * sizeof(int3), 0);
00595 CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00596
00597 return MSMPOT_SUCCESS;
00598 }
00599
00600
00601 int Msmpot_cuda_compute_shortrng(MsmpotCuda *mc) {
00602 Msmpot *msm = mc->msmpot;
00603 float *epotmap = msm->epotmap;
00604 float *padmap = mc->padmap;
00605 const int nxbins = msm->nbx;
00606 const int nybins = msm->nby;
00607 const int nzbins = msm->nbz;
00608 const int nbins = nxbins * nybins * nzbins;
00609 const int mx = msm->mx;
00610 const int my = msm->my;
00611 const int mz = msm->mz;
00612 const int mxRegions = (mc->pmx >> LOG_REGS_X);
00613 const int myRegions = (mc->pmy >> LOG_REGS_Y);
00614 const int mzRegions = (mc->pmz >> LOG_REGS_Z);
00615 const long pmall = (mc->pmx * mc->pmy) * (long) mc->pmz;
00616 const float cutoff2 = msm->a * msm->a;
00617 const float invcut = 1.f / msm->a;
00618 const float rx0 = msm->px0 - msm->lx0;
00619 const float ry0 = msm->py0 - msm->ly0;
00620 const float rz0 = msm->pz0 - msm->lz0;
00621 int i, j, k;
00622 int mxRegionIndex, myRegionIndex, mzRegionIndex;
00623 int mxOffset, myOffset, mzOffset;
00624 long indexRegion, index;
00625 float *thisRegion;
00626 dim3 gridDim, blockDim;
00627 cudaStream_t shortrng_stream;
00628 int rc = MSMPOT_SUCCESS;
00629
00630 REPORT("Perform atom hashing for CUDA short-range part.");
00631 rc = Msmpot_compute_shortrng_bin_hashing(msm);
00632 if (rc != MSMPOT_SUCCESS) return ERROR(rc);
00633
00634 #ifdef MSMPOT_REPORT
00635 if (msm->isperiodic) {
00636 REPORT("Computing periodic short-range part with CUDA.");
00637 }
00638 else {
00639 REPORT("Computing short-range part with CUDA.");
00640 }
00641 #endif
00642
00643
00644 cudaMemcpy(mc->dev_bin, msm->bin, nbins * msm->bindepth * sizeof(float4),
00645 cudaMemcpyHostToDevice);
00646 CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00647
00648 gridDim.x = mxRegions;
00649 gridDim.y = myRegions;
00650 gridDim.z = 1;
00651 blockDim.x = BLOCK_DIM_X;
00652 blockDim.y = BLOCK_DIM_Y;
00653 blockDim.z = BLOCK_DIM_Z;
00654
00655 cudaStreamCreate(&shortrng_stream);
00656 if (msm->isperiodic) {
00657 cuda_shortrange<<<gridDim, blockDim, 0>>>(
00658 mc->dev_padmap, mzRegions,
00659 msm->dx, msm->dy, msm->dz,
00660 rx0, ry0, rz0,
00661 msm->px, msm->py, msm->pz, msm->isperiodic,
00662 mc->dev_bin, nxbins, nybins, nzbins, msm->bindepth,
00663 msm->invbx, msm->invby, msm->invbz,
00664 cutoff2, invcut);
00665 }
00666 else {
00667 cuda_shortrange_nonperiodic<<<gridDim, blockDim, 0>>>(
00668 mc->dev_padmap, mzRegions,
00669 msm->dx, msm->dy, msm->dz,
00670 rx0, ry0, rz0,
00671 mc->dev_bin, nxbins, nybins, nzbins, msm->bindepth,
00672 msm->invbx, msm->invby, msm->invbz,
00673 cutoff2, invcut);
00674 }
00675 if (msm->nover > 0) {
00676 #ifdef MSMPOT_REPORT
00677 char msg[120];
00678 sprintf(msg, "Extra atoms (%d) from overflowed bins "
00679 "must also be evaluated.", msm->nover);
00680 REPORT(msg);
00681 #endif
00682 rc = Msmpot_compute_shortrng_linklist(mc->msmpot, msm->over, msm->nover);
00683 if (rc) return ERROR(rc);
00684 }
00685 cudaStreamSynchronize(shortrng_stream);
00686 CUERR(MSMPOT_ERROR_CUDA_KERNEL);
00687 cudaDeviceSynchronize();
00688 cudaStreamDestroy(shortrng_stream);
00689
00690
00691 cudaMemcpy(padmap, mc->dev_padmap, pmall * sizeof(float),
00692 cudaMemcpyDeviceToHost);
00693 CUERR(MSMPOT_ERROR_CUDA_MEMCPY);
00694
00695
00696 for (k = 0; k < mz; k++) {
00697 mzRegionIndex = (k >> LOG_REGS_Z);
00698 mzOffset = (k & MASK_REGS_Z);
00699
00700 for (j = 0; j < my; j++) {
00701 myRegionIndex = (j >> LOG_REGS_Y);
00702 myOffset = (j & MASK_REGS_Y);
00703
00704 for (i = 0; i < mx; i++) {
00705 mxRegionIndex = (i >> LOG_REGS_X);
00706 mxOffset = (i & MASK_REGS_X);
00707
00708 thisRegion = padmap
00709 + ((mzRegionIndex * myRegions + myRegionIndex) * (long) mxRegions
00710 + mxRegionIndex) * REGION_SIZE;
00711
00712 indexRegion = (mzOffset * REGSIZE_Y + myOffset) * REGSIZE_X + mxOffset;
00713 index = (k * my + j) * mx + i;
00714
00715 epotmap[index] += thisRegion[indexRegion];
00716 }
00717 }
00718 }
00719
00720 return MSMPOT_SUCCESS;
00721 }