ComputePmeCUDAMgr.C

Go to the documentation of this file.
00001 #include <vector>
00002 #include <numeric>
00003 #include <algorithm>
00004 #include "Node.h"
00005 #include "PatchMap.h"
00006 #include "WorkDistrib.h"
00007 #include "Priorities.h"
00008 
00009 #include "CudaUtils.h"
00010 
00011 #include "SimParameters.h"
00012 #include "CudaPmeSolverUtil.h"
00013 
00014 #include "ComputePmeCUDAMgr.h"
00015 
00016 #include "CudaPmeSolver.h"
00017 #include "ComputePmeCUDA.h"
00018 
00019 #include "DeviceCUDA.h"
00020 #ifdef NAMD_CUDA
00021 #ifdef WIN32
00022 #define __thread __declspec(thread)
00023 #endif
00024 extern __thread DeviceCUDA *deviceCUDA;
00025 
00026 void createStream(cudaStream_t& stream) {
00027 #if CUDA_VERSION >= 5050
00028   int leastPriority, greatestPriority;
00029   cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00030   cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
00031   // cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,leastPriority));
00032 #else
00033   cudaCheck(cudaStreamCreate(&stream));
00034 #endif
00035 }
00036 
00037 //
00038 // CUDA implementation of atom storage
00039 //
00040 class CudaPmeAtomStorage : public PmeAtomStorage {
00041 public:
00042   CudaPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
00043   ~CudaPmeAtomStorage() {
00044     if (atom != NULL) dealloc_((void *)atom);
00045     if (atomIndex != NULL) dealloc_((void *)atomIndex);
00046     if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
00047     if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
00048   }
00049 private:
00050   
00051   // Allocate array of size bytes
00052   void* alloc_(const int size) {
00053     void* p;
00054     cudaCheck(cudaMallocHost(&p, size));
00055     return p;
00056   }
00057 
00058   // Deallocate array
00059   void dealloc_(void *p) {
00060     cudaCheck(cudaFreeHost(p));
00061   }
00062 
00063 };
00064 
00065 //
00066 // CPU implementation of atom storage
00067 //
00068 class CpuPmeAtomStorage : public PmeAtomStorage {
00069 public:
00070   CpuPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
00071   ~CpuPmeAtomStorage() {
00072     if (atom != NULL) dealloc_((void *)atom);
00073     if (atomIndex != NULL) dealloc_((void *)atomIndex);
00074     if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
00075     if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
00076   }
00077 private:
00078   
00079   // Allocate array of size bytes
00080   void* alloc_(const int size) {
00081     return (void *)(new char[size]);
00082   }
00083 
00084   // Deallocate array
00085   void dealloc_(void *p) {
00086     delete [] (char *)p;
00087   }
00088 
00089 };
00090 
00091 PmeAtomFiler::PmeAtomFiler() {
00092   for (int p=0;p < 10;p++) {
00093     pencil[p] = NULL;
00094     pencilCapacity[p] = 0;
00095   }  
00096 }
00097 PmeAtomFiler::PmeAtomFiler(CkMigrateMessage *m) {
00098   for (int p=0;p < 10;p++) {
00099     pencil[p] = NULL;
00100     pencilCapacity[p] = 0;
00101   }
00102 }
00103 PmeAtomFiler::~PmeAtomFiler() {
00104   for (int p=0;p < 10;p++) {
00105     if (pencil[p] != NULL) delete [] pencil[p];
00106   }
00107 }
00108 
00109 //
00110 // File atoms into PME pencils. Each atom can belong to maximum 9 pencils
00111 // (oy, oz) = origin of the atoms
00112 // (miny, minz) = grid minimum corner for this patch
00113 // NOTE: This method can only be called locally from the same Pe
00114 //
00115 void PmeAtomFiler::fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
00116   const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi) {
00117 
00118   // Make sure there's enough room in the pencil arrays
00119   for (int p=0;p < 10;p++) {
00120     if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
00121       delete [] pencil[p];
00122       pencil[p] = NULL;
00123       pencilCapacity[p] = 0;
00124     }
00125     if (pencil[p] == NULL) {
00126       int newSize = (int)(numAtoms*1.5);
00127       pencil[p] = new int[newSize];
00128       pencilCapacity[p] = newSize;
00129     }
00130     pencilSize[p] = 0;
00131   }
00132 
00133   const float recip11 = lattice.a_r().x;
00134   const float recip22 = lattice.b_r().y;
00135   const float recip33 = lattice.c_r().z;
00136   const int order1 = pmeGrid.order - 1;
00137   const int K1 = pmeGrid.K1;
00138   const int K2 = pmeGrid.K2;
00139   const int K3 = pmeGrid.K3;
00140   const int yBlocks = pmeGrid.yBlocks;
00141   const int zBlocks = pmeGrid.zBlocks;
00142 
00143   for (int i=0;i < numAtoms;i++) {
00144     float frx, fry, frz;
00145     // PmeRealSpaceCompute::calcGridCoord(atoms[i].uix, atoms[i].uiy, atoms[i].uiz,
00146     //   K1, K2, K3, frx, fry, frz);
00147     PmeRealSpaceCompute::calcGridCoord(atoms[i].x, atoms[i].y, atoms[i].z, K1, K2, K3, frx, fry, frz);
00148     // Charge is spread in the region [y0 ... y0+order-1] x [z0 ... z0+order-1]
00149     int y0 = (int)fry;
00150     int z0 = (int)frz;
00151     if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
00152       // Add to "Stray pencil" and skip to next atom
00153       pencil[9][pencilSize[9]++] = i;
00154       continue;
00155       // fprintf(stderr, "%lf %lf %lf\n", atoms[i].x, atoms[i].y, atoms[i].z);
00156       // NAMD_bug("PmeAtomFiler::fileAtoms, charge out of bounds");
00157     }
00158     // Calculate pencil index for the four corners of the order X order area
00159     // The corners determine the pencil indices for this atom.
00160     int occupied = 0;
00161     int plist[4];
00162 #pragma unroll
00163     for (int j=0;j < 4;j++) {
00164 
00165       int py = getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
00166       if (py < ylo) py += yBlocks;
00167       if (py > yhi) py -= yBlocks;
00168 
00169       int pz = getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
00170       if (pz < zlo) pz += zBlocks;
00171       if (pz > zhi) pz -= zBlocks;
00172 
00173       if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
00174         // Add to "Stray pencil" and skip to next atom
00175         pencil[9][pencilSize[9]++] = i;
00176         goto breakjloop;
00177         // fprintf(stderr, "py %d [%d ... %d] pz %d [%d ... %d]\n", pz, zlo, zhi);
00178         // NAMD_bug("PmeAtomFiler::fileAtoms, py,pz outside bounds");
00179       }
00180       // p = 0,1,2,3,4,5,6,7,8 (maximum range)
00181       plist[j] = (py-ylo) + (pz-zlo)*3;
00182     }
00183 
00184 #pragma unroll
00185     for (int j=0;j < 4;j++) {
00186       int p = plist[j];
00187       // pbit = 0, 2, 4, 8, 16, 32, 64, 128, 256
00188       int pbit = (1 << p);
00189       if (!(occupied & pbit)) {
00190         pencil[p][pencilSize[p]++] = i;
00191         occupied |= pbit;
00192       }
00193     }
00194 
00195 breakjloop:
00196     continue;
00197   }
00198 
00199 }
00200 
00201 //
00202 // Class constructor
00203 //
00204 ComputePmeCUDAMgr::ComputePmeCUDAMgr() {
00205   __sdag_init();
00206   numDevices = 0;
00207   numTotalPatches = 0;
00208   numNodesContributed = 0;
00209   numDevicesMax = 0;
00210 }
00211 
00212 //
00213 // Class constructor
00214 //
00215 ComputePmeCUDAMgr::ComputePmeCUDAMgr(CkMigrateMessage *) {
00216   __sdag_init();
00217   NAMD_bug("ComputePmeCUDAMgr cannot be migrated");
00218   numDevices = 0;
00219   numTotalPatches = 0;
00220   numNodesContributed = 0;
00221   numDevicesMax = 0;
00222 }
00223 
00224 //
00225 // Class destructor
00226 //
00227 ComputePmeCUDAMgr::~ComputePmeCUDAMgr() {
00228   for (int i=0;i < extraDevices.size();i++) {
00229     cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
00230     cudaCheck(cudaStreamDestroy(extraDevices[i].stream));
00231   }
00232 }
00233 
//
// Returns home pencil (homey, homez) for patch "patchID".
// Home pencil = pencil with most overlap with this patch.
// Aborts via NAMD_bug if the patch lies outside the grid, a pencil is
// smaller than the interpolation order, or no overlapping pencil exists.
//
void ComputePmeCUDAMgr::getHomePencil(PatchID patchID, int& homey, int& homez) {
  PatchMap *patchMap = PatchMap::Object();

  // Patch extents along the lattice b and c directions
  // (fractional coordinates; the +0.5 shift below maps them into [0,1))
  BigReal miny = patchMap->min_b(patchID);
  BigReal maxy = patchMap->max_b(patchID);

  BigReal minz = patchMap->min_c(patchID);
  BigReal maxz = patchMap->max_c(patchID);

  // Determine home pencil = pencil with most overlap

  // Calculate patch grid coordinates (inclusive ranges
  // [patch_y0..patch_y1] x [patch_z0..patch_z1])
  int patch_y0 = floor((miny+0.5)*pmeGrid.K2);
  int patch_y1 = floor((maxy+0.5)*pmeGrid.K2)-1;
  int patch_z0 = floor((minz+0.5)*pmeGrid.K3);
  int patch_z1 = floor((maxz+0.5)*pmeGrid.K3)-1;

  if (patch_y0 < 0 || patch_y1 >= pmeGrid.K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.K3) {
    NAMD_bug("ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
  }

  // Exhaustively scan all (y,z) pencils and keep the one whose grid range
  // overlaps the patch rectangle the most
  int maxOverlap = 0;
  homey = -1;
  homez = -1;
  for (int iz=0;iz < pmeGrid.zBlocks;iz++) {
    for (int iy=0;iy < pmeGrid.yBlocks;iy++) {
      int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
      getPencilDim(pmeGrid, Perm_X_Y_Z, iy, iz,
        pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);

      if (pencil_y1 - pencil_y0 < pmeGrid.order || pencil_z1 - pencil_z0 < pmeGrid.order)
        NAMD_bug("ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");

      // Intersection of the patch rectangle with this pencil's rectangle
      int y0 = std::max(patch_y0, pencil_y0);
      int y1 = std::min(patch_y1, pencil_y1);
      int z0 = std::max(patch_z0, pencil_z0);
      int z1 = std::min(patch_z1, pencil_z1);

      // Overlap area; -1 when the rectangles do not properly overlap
      int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;

      if (overlap > maxOverlap) {
        maxOverlap = overlap;
        homey = iy;
        homez = iz;
      }
    }
  }

  if (homey == -1 || homez == -1)
    NAMD_bug("ComputePmeCUDAMgr::getHomePencil, home pencil not found");
}
00289 
//
// Restricts the requested pencil decomposition (pmeGrid.{x,y,z}Blocks) so
// that every patch's charge-spreading region touches at most the home
// pencil and its immediate neighbor in each of y and z. Whenever a patch
// would need pencils farther than distance 1 from its home pencil, the
// pencil count in that dimension is decreased and the scan restarts for
// that patch.
//
void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
  PatchMap *patchMap = PatchMap::Object();
  SimParameters *simParams = Node::Object()->simParameters;
  Lattice lattice = simParams->lattice;
  // System extents along the b and c lattice directions
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal sysdimc = lattice.c_r().unit() * lattice.c();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  // Margin (in fractional coordinates) by which atoms may stray outside
  // their patch bounds
  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
  BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
  int numPatches = patchMap->numPatches();

  // x-pencil count can never exceed the number of grid points in x
  pmeGrid.xBlocks = std::min(pmeGrid.xBlocks, pmeGrid.K1);

  int pid = 0;
  while (pid < numPatches) {
    // Get home pencil
    int homey, homez;
    getHomePencil(pid, homey, homez);
    // Y
    {
      BigReal miny = patchMap->min_b(pid);
      BigReal maxy = patchMap->max_b(pid);
      // min2 (max2) is smallest (largest) grid line for this patch,
      // padded by the margin and the interpolation-order spread
      int min2 = ((int) floor(pmeGrid.K2 * (miny+0.5 - marginb)));
      int max2 = ((int) floor(pmeGrid.K2 * (maxy+0.5 + marginb))) + (pmeGrid.order - 1);
      // Restrict grid lines to [0 ... pmeGrid.K2-1] (periodic wrap)
      if (min2 < 0) min2 += pmeGrid.K2;
      if (max2 >= pmeGrid.K2) max2 -= pmeGrid.K2;
      // Get pencil indices for the grid lines
      int min2pi = getPencilIndexY(pmeGrid, min2);
      int max2pi = getPencilIndexY(pmeGrid, max2);
      // Distance from home pencil (wrapped to be non-negative)
      int dmin2pi = homey - min2pi;
      if (dmin2pi < 0) dmin2pi += pmeGrid.yBlocks;
      if (dmin2pi < 0)
        NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
      // If distance is > 1, must decrease the number of y-pencils and try again
      // (continue restarts the scan for this same patch)
      if (dmin2pi > 1) {
        pmeGrid.yBlocks--;
        if (pmeGrid.yBlocks <= 0) break;
        continue;
      }
      int dmax2pi = max2pi - homey;
      if (dmax2pi < 0) dmax2pi += pmeGrid.yBlocks;
      if (dmax2pi < 0)
        NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
      // If distance is > 1, must decrease the number of y-pencils and try again
      if (dmax2pi > 1) {
        pmeGrid.yBlocks--;
        if (pmeGrid.yBlocks <= 0) break;
        continue;
      }
    }

    // Z (same procedure as Y, along the c/z direction)
    {
      BigReal minz = patchMap->min_c(pid);
      BigReal maxz = patchMap->max_c(pid);
      // min3 (max3) is smallest (largest) grid line for this patch
      int min3 = ((int) floor(pmeGrid.K3 * (minz+0.5 - marginc)));
      int max3 = ((int) floor(pmeGrid.K3 * (maxz+0.5 + marginc))) + (pmeGrid.order - 1);
      // Restrict grid lines to [0 ... pmeGrid.K3-1] (periodic wrap)
      if (min3 < 0) min3 += pmeGrid.K3;
      if (max3 >= pmeGrid.K3) max3 -= pmeGrid.K3;
      // Get pencil indices for the grid lines
      int min3pi = getPencilIndexZ(pmeGrid, min3);
      int max3pi = getPencilIndexZ(pmeGrid, max3);
      // Distance from home pencil (wrapped to be non-negative)
      int dmin3pi = homez - min3pi;
      if (dmin3pi < 0) dmin3pi += pmeGrid.zBlocks;
      if (dmin3pi < 0)
        NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
      // If distance is > 1, must decrease the number of z-pencils and try again
      if (dmin3pi > 1) {
        pmeGrid.zBlocks--;
        if (pmeGrid.zBlocks <= 0) break;
        continue;
      }
      int dmax3pi = max3pi - homez;
      if (dmax3pi < 0) dmax3pi += pmeGrid.zBlocks;
      if (dmax3pi < 0)
        NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
      // If distance is > 1, must decrease the number of z-pencils and try again
      if (dmax3pi > 1) {
        pmeGrid.zBlocks--;
        if (pmeGrid.zBlocks <= 0) break;
        continue;
      }
    }

    // This patch is satisfied; move on to the next one
    pid++;
  }

  // if (CkMyNode() == 0)
  //   fprintf(stderr, "pmeGrid.yBlocks %d pmeGrid.zBlocks %d\n", pmeGrid.yBlocks, pmeGrid.zBlocks);

  if (pmeGrid.xBlocks <= 0 || pmeGrid.yBlocks <= 0 || pmeGrid.zBlocks <= 0)
    NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");

  if (pmeGrid.xBlocks > pmeGrid.K1 || pmeGrid.yBlocks > pmeGrid.K2|| pmeGrid.zBlocks > pmeGrid.K3)
    NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
}
00396 
//
// Sets up pencils. May be called multiple times.
// Chooses the pencil decomposition (1D pencil / 2D slab / 3D single
// block), maps pencils to Pes, and builds the per-node device lists.
//
void ComputePmeCUDAMgr::setupPencils() {
  SimParameters *simParams = Node::Object()->simParameters;

  pmeGrid.K1 = simParams->PMEGridSizeX;
  pmeGrid.K2 = simParams->PMEGridSizeY;
  pmeGrid.K3 = simParams->PMEGridSizeZ;
  pmeGrid.order = simParams->PMEInterpOrder;
  pmeGrid.dim2 = pmeGrid.K2;
  // dim3 holds the padded complex-to-real FFT length in z
  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);

  // Count the total number of devices assuming all nodes have the same number as this node
  // NOTE: This should be changed in the future to support heterogeneous nodes!!!
  int numDevicesTmp = deviceCUDA->getNumDevice();

  int numDeviceTot = CkNumNodes() * numDevicesTmp;
  // Use approximately 1/4th of the devices for PME
  int numDeviceToUse = std::max(1, numDeviceTot/4);

  if (numDeviceToUse < 4) {
    // 2D Slab
    pmeGrid.yBlocks = 1;
    pmeGrid.xBlocks = pmeGrid.zBlocks = numDeviceToUse;
  } else {
    // 1D Pencil
    pmeGrid.yBlocks = (int)sqrt((double)numDeviceToUse);
    pmeGrid.zBlocks = numDeviceToUse/pmeGrid.yBlocks;
    pmeGrid.xBlocks = pmeGrid.zBlocks;
  }

  // User-specified pencil counts override the heuristic above
  if ( simParams->PMEPencilsX > 0 ) pmeGrid.xBlocks = simParams->PMEPencilsX;
  if ( simParams->PMEPencilsY > 0 ) pmeGrid.yBlocks = simParams->PMEPencilsY;
  if ( simParams->PMEPencilsZ > 0 ) pmeGrid.zBlocks = simParams->PMEPencilsZ;

  // Restrict number of pencils to the maximum number
  restrictToMaxPMEPencils();

  // Fix pencil numbers if they don't make sense w.r.t. number of devices
  if (pmeGrid.yBlocks == 1) {
    // 2D Slab
    if (pmeGrid.xBlocks > numDeviceTot) pmeGrid.xBlocks = numDeviceTot;
    if (pmeGrid.zBlocks > numDeviceTot) pmeGrid.zBlocks = numDeviceTot;
  } else {
    // 1D Pencil: no FFT stage may use more pencils than there are devices
    if (pmeGrid.yBlocks*pmeGrid.zBlocks > numDeviceTot ||
      pmeGrid.xBlocks*pmeGrid.zBlocks > numDeviceTot ||
      pmeGrid.xBlocks*pmeGrid.yBlocks > numDeviceTot) {
      pmeGrid.yBlocks = std::min(pmeGrid.yBlocks, (int)sqrt((double)numDeviceTot));
      pmeGrid.zBlocks = std::min(pmeGrid.zBlocks, numDeviceTot/pmeGrid.yBlocks);
    }
    pmeGrid.xBlocks = std::min(pmeGrid.yBlocks, pmeGrid.zBlocks);
  }

  // Here (block1, block2, block3) define the size of charge grid pencil in each direction
  // (ceil-divide so the blocks cover the whole grid)
  pmeGrid.block1 = ( pmeGrid.K1 + pmeGrid.xBlocks - 1 ) / pmeGrid.xBlocks;
  pmeGrid.block2 = ( pmeGrid.K2 + pmeGrid.yBlocks - 1 ) / pmeGrid.yBlocks;
  pmeGrid.block3 = ( pmeGrid.K3 + pmeGrid.zBlocks - 1 ) / pmeGrid.zBlocks;

  // Determine type of FFT
  if (pmeGrid.xBlocks == 1 && pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1) {
    // Single block => 3D FFT
    pmePencilType = 3;
  } else if (pmeGrid.yBlocks == 1) {
    // Blocks in all but y-dimension => 2D FFT
    pmePencilType = 2;
  } else {
    // Blocks in all dimensions => 1D FFT
    pmePencilType = 1;
  }

  //--------------------------------------------------------------------------
  // Map pencils into Pes
  xPes.resize(pmeGrid.yBlocks*pmeGrid.zBlocks);

  // y- and z-pencil Pe lists are only needed for the multi-stage FFT types
  if (pmePencilType == 1 || pmePencilType == 2) {
    zPes.resize(pmeGrid.xBlocks*pmeGrid.yBlocks);
  }
  if (pmePencilType == 1) {
    yPes.resize(pmeGrid.xBlocks*pmeGrid.zBlocks);
  }  

    // i % numDeviceTot                              = device index
    // (i % numDeviceTot)/deviceCUDA->getNumDevice() = node index
    // (i % CkNodeSize(node))                        = pe displacement
  for (int i=0;i < xPes.size();i++) {
    int node = (i % numDeviceTot)/numDevicesTmp;
    xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
  }
  for (int i=0;i < yPes.size();i++) {
    int node = (i % numDeviceTot)/numDevicesTmp;
    yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
  }
  for (int i=0;i < zPes.size();i++) {
    int node = (i % numDeviceTot)/numDevicesTmp;
    zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
  }

  // char peStr[256];
  // char *p = peStr;
  // p += sprintf(p, "%2d | xPes", CkMyPe());
  // for (int i=0;i < xPes.size();i++)
  //   p += sprintf(p, " %d", xPes[i]);
  // p += sprintf(p, " yPes");
  // for (int i=0;i < yPes.size();i++)
  //   p += sprintf(p, " %d", yPes[i]);
  // p += sprintf(p, " zPes");
  // for (int i=0;i < zPes.size();i++)
  //   p += sprintf(p, " %d", zPes[i]);
  // fprintf(stderr, "%s | %d %d\n",peStr, CkNodeFirst(CkMyNode()), CkNodeSize(CkMyNode()));

  //--------------------------------------------------------------------------
  // Build global node list for x-pencils.
  // device is the local device ordinal when the pencil lives on this node,
  // -1 otherwise; numDevices counts the x-pencils hosted on this node.
  nodeDeviceList.resize(xPes.size());
  numDevices = 0;
  for (int k=0;k < xPes.size();k++) {
    nodeDeviceList[k].node = CkNodeOf(xPes[k]);
    nodeDeviceList[k].device = -1;
    if (nodeDeviceList[k].node == CkMyNode()) {
      nodeDeviceList[k].device = numDevices++;
    }
  }

  ijPencilX.clear();
  ijPencilY.clear();
  ijPencilZ.clear();

  // Construct list of pencil coordinates (i,j) for each device held by this node
  for (int k=0;k < xPes.size();k++) {
    if (CkMyNode() == CkNodeOf(xPes[k])) {
      IJ ij;
      ij.i = k % pmeGrid.yBlocks;
      ij.j = k / pmeGrid.yBlocks;
      ijPencilX.push_back(ij);
    }
  }
  if (ijPencilX.size() != numDevices)
    NAMD_bug("ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");

  // Same for y-pencils hosted on this node
  int numDevicesY = 0;
  for (int k=0;k < yPes.size();k++) {
    if (CkMyNode() == CkNodeOf(yPes[k])) {
      IJ ij;
      ij.i = k % pmeGrid.xBlocks;
      ij.j = k / pmeGrid.xBlocks;
      ijPencilY.push_back(ij);
      numDevicesY++;
    }
  }

  // Same for z-pencils hosted on this node
  int numDevicesZ = 0;
  for (int k=0;k < zPes.size();k++) {
    if (CkMyNode() == CkNodeOf(zPes[k])) {
      IJ ij;
      ij.i = k % pmeGrid.xBlocks;
      ij.j = k / pmeGrid.xBlocks;
      ijPencilZ.push_back(ij);
      numDevicesZ++;
    }
  }  
}
00559 
00560 //
00561 // Returns true if PE "pe" is used in PME
00562 //
00563 bool ComputePmeCUDAMgr::isPmePe(int pe) {
00564   for (int i=0;i < xPes.size();i++) {
00565     if (pe == xPes[i]) return true;
00566   }
00567   return false;
00568 }
00569 
00570 //
00571 // Returns true if node "node" is used for PME
00572 //
00573 bool ComputePmeCUDAMgr::isPmeNode(int node) {
00574   for (int i=0;i < nodeDeviceList.size();i++) {
00575     if (nodeDeviceList[i].node == node) {
00576       return true;
00577     }
00578   }
00579   return false;
00580 }
00581 
00582 //
00583 // Returns true if device "deviceID" is used for PME
00584 //
00585 bool ComputePmeCUDAMgr::isPmeDevice(int deviceID) {
00586   for (int i=0;i < nodeDeviceList.size();i++) {
00587     if (deviceCUDA->getDeviceIDbyRank(nodeDeviceList[i].device % deviceCUDA->getNumDevice()) == deviceID) {
00588       return true;
00589     }
00590   }
00591   return false;
00592 }
00593 
//
// Initialize compute manager
// This gets called on one Pe on each node from Node::startup()
// Sets up the pencil decomposition, then (on node 0 only) creates the
// pencil chare arrays matching the chosen FFT type and broadcasts their
// proxies to all nodes via recvPencils().
//
void ComputePmeCUDAMgr::initialize(CkQdMsg *msg) {
  if (msg != NULL) delete msg;

  setupPencils();

  if ( ! CkMyNode() ) {
    iout << iINFO << "PME using " << pmeGrid.xBlocks << " x " <<
      pmeGrid.yBlocks << " x " << pmeGrid.zBlocks <<
      " pencil grid for FFT and reciprocal sum.\n" << endi;
  }

  // Initialize list that contains the number of home patches for each device on this manager
  numHomePatchesList.resize(numDevices, 0);

  //--------------------------------------------------------------------------
  // Create devices and atom filer
  // numDevices = number of devices we'll be using, possibly different on each node
  // Use root node to compute the maximum number of devices in use over all nodes
  thisProxy[0].initializeDevicesAndAtomFiler(new NumDevicesMsg(numDevices));
  //--------------------------------------------------------------------------

  if (CkMyNode() == 0) {

    if (pmePencilType == 3) {
      // Single block => 3D FFT: one XYZ pencil chare on the first x-pencil Pe
      CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
      CkArrayOptions xyzOpts(1);
      xyzOpts.setMap(xyzMap);
      xyzOpts.setAnytimeMigration(false);
      xyzOpts.setStaticInsertion(true);
      pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
      pmePencilXYZ[0].initialize(new CudaPmeXYZInitMsg(pmeGrid));
      thisProxy.recvPencils(pmePencilXYZ);
    } else if (pmePencilType == 2) {
      // Blocks in all but y-dimension => 2D FFT: XY slabs plus a Z stage
      CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
      CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
      CkArrayOptions xyOpts(1, 1, pmeGrid.zBlocks);
      CkArrayOptions zOpts(pmeGrid.xBlocks, 1, 1);
      xyOpts.setMap(xyMap);
      zOpts.setMap(zMap);
      xyOpts.setAnytimeMigration(false);
      zOpts.setAnytimeMigration(false);
      xyOpts.setStaticInsertion(true);
      zOpts.setStaticInsertion(true);
      pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
      pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
      // Send pencil proxies to other nodes
      thisProxy.recvPencils(pmePencilXY, pmePencilZ);
      pmePencilXY.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
      pmePencilZ.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
    } else {
      // Blocks in all dimensions => 1D FFT: separate X, Y and Z pencil stages
      CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.yBlocks, xPes);
      CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.xBlocks, yPes);
      CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
      CkArrayOptions xOpts(1, pmeGrid.yBlocks, pmeGrid.zBlocks);
      CkArrayOptions yOpts(pmeGrid.xBlocks, 1, pmeGrid.zBlocks);
      CkArrayOptions zOpts(pmeGrid.xBlocks, pmeGrid.yBlocks, 1);
      xOpts.setMap(xMap);
      yOpts.setMap(yMap);
      zOpts.setMap(zMap);
      xOpts.setAnytimeMigration(false);
      yOpts.setAnytimeMigration(false);
      zOpts.setAnytimeMigration(false);
      xOpts.setStaticInsertion(true);
      yOpts.setStaticInsertion(true);
      zOpts.setStaticInsertion(true);
      pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
      pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
      pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
      // Send pencil proxies to other nodes
      thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
      pmePencilX.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
      pmePencilY.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
      pmePencilZ.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
    }
  }

}
00678 
//
// Creates the per-device chare group proxies and the atom filer group,
// then broadcasts them to all nodes (recvDevices / recvAtomFiler).
// Must run on the root node, which is the only node holding numDevicesMax.
//
void ComputePmeCUDAMgr::createDevicesAndAtomFiler() {
  if (CkMyNode() != 0)
    NAMD_bug("ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");

  // Root node creates all device proxies
  // NOTE: Only root node has numDevicesMax
  RecvDeviceMsg* msg = new (numDevicesMax, PRIORITY_SIZE) RecvDeviceMsg();
  msg->numDevicesMax = numDevicesMax;
  for (int i=0;i < numDevicesMax;i++) {
    CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
    // Proxies are copied bytewise into the varsize message payload
    memcpy(&msg->dev[i], &dev, sizeof(CProxy_ComputePmeCUDADevice));
  }
  thisProxy.recvDevices(msg);

  CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
  thisProxy.recvAtomFiler(filer);

}
00697 
// Stores the atom filer group proxy broadcast from the root node.
void ComputePmeCUDAMgr::recvAtomFiler(CProxy_PmeAtomFiler filer) {
  pmeAtomFiler = filer;
}
00701 
00702 void ComputePmeCUDAMgr::recvDevices(RecvDeviceMsg* msg) {
00703   numDevicesMax = msg->numDevicesMax;
00704   if (numDevices > numDevicesMax)
00705     NAMD_bug("ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
00706   deviceProxy.resize(numDevices);
00707   for (int i=0;i < numDevices;i++) {
00708     deviceProxy[i] = msg->dev[i];
00709   }
00710   delete msg;
00711 }
00712 
// Stores the XYZ-pencil proxy broadcast from node 0 (3D-FFT case).
void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXYZ xyz) {
  pmePencilXYZ = xyz;
}
00716 
// Stores the XY- and Z-pencil proxies broadcast from node 0 (2D-FFT case).
void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z) {
  pmePencilXY = xy;
  pmePencilZ = z;
}
00721 
// Stores the X-, Y- and Z-pencil proxies broadcast from node 0 (1D-FFT case).
void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z) {
  pmePencilX = x;
  pmePencilY = y;
  pmePencilZ = z;
}
00727 
//
// Initialize pencils on this node
// This gets called on one rank on each node
// Binds each local device proxy to its x-pencil, then hands every local
// y- and z-pencil a (deviceID, stream) pair — reusing the x-pencil
// devices first and creating extra streams when there are more y/z
// pencils than x-pencils on this node.
//
void ComputePmeCUDAMgr::initialize_pencils(CkQdMsg *msg) {
  if (msg != NULL) delete msg;

  int numDevicesTmp = deviceCUDA->getNumDevice();

  // Initialize device proxies for real-space interfacing
  for (int i=0;i < ijPencilX.size();i++) {
    // NOTE: i is here the device ID
    int deviceID = deviceCUDA->getDeviceIDbyRank(i % numDevicesTmp);
    deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
      deviceID, pmePencilType, thisProxy, pmeAtomFiler);
    if (pmePencilType == 1) {
      deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
    } else if (pmePencilType == 2) {
      deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
    } else {
      deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
    }
  }

  // Use above initialized device proxies for the PME pencils that interface with real-space
  for (int i=0;i < ijPencilX.size();i++) {
    if (pmePencilType == 1) {
      pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
    } else if (pmePencilType == 2) {
      pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
    } else {
      pmePencilXYZ[0].initializeDevice(new InitDeviceMsg(deviceProxy[i]));
    }
  }

  // Create extra devices for Y and Z pencils if necessary
  int n = std::max(ijPencilY.size(), ijPencilZ.size());
  if (n > ijPencilX.size()) {
    int nextra = n - ijPencilX.size();
    extraDevices.resize(nextra);
    for (int i=0;i < nextra;i++) {
      // NOTE(review): this stores a device *rank* modulo the device count,
      // whereas the x-pencil path maps ranks through
      // deviceCUDA->getDeviceIDbyRank() — confirm this is intentional
      extraDevices[i].deviceID = (i + ijPencilX.size())  % numDevicesTmp;
      cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
      createStream(extraDevices[i].stream);
    }
  }

  // Initialize Y pencils: reuse x-pencil devices/streams first,
  // then fall back to the extra devices created above
  for (int i=0;i < ijPencilY.size();i++) {
    int deviceID;
    cudaStream_t stream;
    if (i < ijPencilX.size()) {
      deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
      stream   = deviceProxy[i].ckLocalBranch()->getStream();
    } else {
      deviceID = extraDevices[i-ijPencilX.size()].deviceID;
      stream   = extraDevices[i-ijPencilX.size()].stream;
    }
    pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
  }

  // Initialize Z pencils (same device/stream assignment scheme as Y)
  for (int i=0;i < ijPencilZ.size();i++) {
    int deviceID;
    cudaStream_t stream;
    if (i < ijPencilX.size()) {
      deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
      stream   = deviceProxy[i].ckLocalBranch()->getStream();
    } else {
      deviceID = extraDevices[i-ijPencilX.size()].deviceID;
      stream   = extraDevices[i-ijPencilX.size()].stream;
    }
    pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
  }

}
00804 
00805 //
00806 // Activate (start) pencils
00807 // This gets called on rank 0 Pe on each node
00808 //
00809 void ComputePmeCUDAMgr::activate_pencils(CkQdMsg *msg) {
00810   if (msg != NULL) delete msg;
00811 
00812   for (int device=0;device < numDevices;device++) {
00813     deviceProxy[device].ckLocalBranch()->activate_pencils();
00814   }
00815 
00816   for (int i=0;i < ijPencilY.size();i++) {
00817     PmeStartMsg* pmeStartYMsg = new PmeStartMsg();
00818     pmeStartYMsg->data = NULL;
00819     pmeStartYMsg->dataSize = 0;
00820     pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
00821   }
00822 
00823   for (int i=0;i < ijPencilZ.size();i++) {
00824     PmeStartMsg* pmeStartZMsg = new PmeStartMsg();
00825     pmeStartZMsg->data = NULL;
00826     pmeStartZMsg->dataSize = 0;
00827     pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
00828   }
00829 
00830 }
00831 
00832 //
00833 // Returns node that contains x-pencil i,j
00834 //
00835 int ComputePmeCUDAMgr::getNode(int i, int j) {
00836   if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
00837     NAMD_bug("ComputePmeCUDAMgr::getNode, pencil index out of bounds");
00838   int ind = i + j*pmeGrid.yBlocks;
00839   return nodeDeviceList[ind].node;
00840 }
00841 
00842 //
00843 // Returns home node for a patch
00844 //
00845 int ComputePmeCUDAMgr::getHomeNode(PatchID patchID) {
00846   int homey, homez;
00847   getHomePencil(patchID, homey, homez);
00848   return getNode(homey, homez);
00849 }
00850 
00851 //
00852 // Returns device index on this node that contains x-pencil i,j
00853 //
00854 int ComputePmeCUDAMgr::getDevice(int i, int j) {
00855   if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
00856     NAMD_bug("ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
00857   int ind = i + j*pmeGrid.yBlocks;
00858   int device = nodeDeviceList[ind].device;
00859   if (device == -1)
00860     NAMD_bug("ComputePmeCUDAMgr::getDevice, no device found");
00861   return device;
00862 }
00863 
00864 //
00865 // Returns device index on this node that contains y-pencil i,j
00866 //
00867 int ComputePmeCUDAMgr::getDevicePencilY(int i, int j) {
00868   if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.zBlocks)
00869     NAMD_bug("ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
00870   for (int device=0;device < ijPencilY.size();device++) {
00871     if (ijPencilY[device].i == i && ijPencilY[device].j == j) return device;
00872   }
00873   char str[256];
00874   sprintf(str, "ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
00875   NAMD_bug(str);
00876   return -1;
00877 }
00878 
00879 //
00880 // Returns device index on this node that contains z-pencil i,j
00881 //
00882 int ComputePmeCUDAMgr::getDevicePencilZ(int i, int j) {
00883   if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.yBlocks)
00884     NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
00885   for (int device=0;device < ijPencilZ.size();device++) {
00886     if (ijPencilZ[device].i == i && ijPencilZ[device].j == j) return device;
00887   }
00888   NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, no device found");
00889   return -1;
00890 }
00891 
00892 //
00893 // Returns device ID on this node that contains x-pencil i,j
00894 //
00895 int ComputePmeCUDAMgr::getDeviceIDPencilX(int i, int j) {
00896   int device = getDevice(i, j);
00897   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00898 }
00899 
00900 //
00901 // Returns device ID on this node that contains y-pencil i,j
00902 //
00903 int ComputePmeCUDAMgr::getDeviceIDPencilY(int i, int j) {
00904   int device = getDevicePencilY(i, j);
00905   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00906 }
00907 
00908 //
00909 // Returns device ID on this node that contains z-pencil i,j
00910 //
00911 int ComputePmeCUDAMgr::getDeviceIDPencilZ(int i, int j) {
00912   int device = getDevicePencilZ(i, j);
00913   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00914 }
00915 
00916 //
00917 // Skip this round of PME, call skip on all Z-pencils (this is needed to get the reductions submitted)
00918 //
00919 void ComputePmeCUDAMgr::skip() {
00920   switch(pmePencilType) {
00921     case 1:
00922     pmePencilZ.skip();
00923     break;
00924     case 2:
00925     pmePencilZ.skip();
00926     break;
00927     case 3:
00928     pmePencilXYZ[0].skip();
00929     break;
00930   }
00931 }
00932 
00933 void ComputePmeCUDAMgr::recvAtoms(PmeAtomMsg *msg) {
00934   int device = getDevice(msg->i, msg->j);
00935   deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
00936 }
00937 
00938 ComputePmeCUDADevice::ComputePmeCUDADevice() {
00939   // __sdag_init();
00940   numHomePatches = 0;
00941   forceCapacity = 0;
00942   force = NULL;
00943   pmeRealSpaceCompute = NULL;
00944   streamCreated = false;
00945   lock_numHomePatchesMerged = CmiCreateLock();
00946   lock_numPencils = CmiCreateLock();
00947   lock_numNeighborsRecv = CmiCreateLock();
00948   lock_recvAtoms = CmiCreateLock();
00949   numNeighborsExpected = 0;
00950   numStrayAtoms = 0;
00951   // Reset counters
00952   numNeighborsRecv = 0;
00953   numHomePatchesRecv = 0;
00954   numHomePatchesMerged = 0;
00955   atomI = 0;
00956   forceI = 1;
00957 }
00958 
00959 ComputePmeCUDADevice::ComputePmeCUDADevice(CkMigrateMessage *m) {
00960   // __sdag_init();
00961   numHomePatches = 0;
00962   forceCapacity = 0;
00963   force = NULL;
00964   pmeRealSpaceCompute = NULL;
00965   streamCreated = false;
00966   lock_numHomePatchesMerged = CmiCreateLock();
00967   lock_numPencils = CmiCreateLock();
00968   lock_numNeighborsRecv = CmiCreateLock();
00969   lock_recvAtoms = CmiCreateLock();
00970   numNeighborsExpected = 0;
00971   numStrayAtoms = 0;
00972   // Reset counters
00973   numNeighborsRecv = 0;
00974   numHomePatchesRecv = 0;
00975   numHomePatchesMerged = 0;
00976   atomI = 0;
00977   forceI = 1;
00978 }
00979 
ComputePmeCUDADevice::~ComputePmeCUDADevice() {
  // Destroy the CUDA stream on the device that created it (stream was
  // created in initialize(); streamCreated guards against a never-initialized object)
  if (streamCreated) {
    cudaCheck(cudaSetDevice(deviceID));
    cudaCheck(cudaStreamDestroy(stream));
  }
  // Free both double-buffered sets of pencil atom storage; the
  // pmeAtomStorageAllocatedHere flag is per pencil slot (index i)
  for (int j=0;j < 2;j++)
    for (int i=0;i < pmeAtomStorage[j].size();i++) {
      if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
    }
  // Pinned host force buffer and real-space compute object
  if (force != NULL) deallocate_host<CudaForce>(&force);
  if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
  // Locks are created unconditionally in the constructors, so always destroy them
  CmiDestroyLock(lock_numHomePatchesMerged);
  CmiDestroyLock(lock_numPencils);
  CmiDestroyLock(lock_numNeighborsRecv);
  CmiDestroyLock(lock_recvAtoms);
}
00996 
00997 void ComputePmeCUDADevice::initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
00998   int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
00999   CProxy_PmeAtomFiler pmeAtomFiler_in) {
01000 
01001 #define CUDA_EVENT_SPREADCHARGE 90
01002   traceRegisterUserEvent("CUDA spreadCharge", CUDA_EVENT_SPREADCHARGE);
01003 #define CUDA_EVENT_GATHERFORCE 91
01004   traceRegisterUserEvent("CUDA gatherForce", CUDA_EVENT_GATHERFORCE);
01005 
01006   deviceID = deviceID_in;
01007   cudaCheck(cudaSetDevice(deviceID));
01008   pmePencilType = pmePencilType_in;
01009   pmeGrid = pmeGrid_in;
01010   pencilIndexY = pencilIndexY_in;
01011   pencilIndexZ = pencilIndexZ_in;
01012   mgrProxy = mgrProxy_in;
01013   pmeAtomFiler = pmeAtomFiler_in;
01014   // Size of the neighboring pencil grid, max 3x3
01015   yNBlocks = std::min(pmeGrid.yBlocks, 3);
01016   zNBlocks = std::min(pmeGrid.zBlocks, 3);
01017   // Local pencil is at y=0,z=0
01018   if (yNBlocks == 1) {
01019     ylo = 0;
01020     yhi = 0;
01021   } else if (yNBlocks == 2) {
01022     ylo = -1;
01023     yhi = 0;
01024   } else {
01025     ylo = -1;
01026     yhi = 1;
01027   }
01028   if (zNBlocks == 1) {
01029     zlo = 0;
01030     zhi = 0;
01031   } else if (zNBlocks == 2) {
01032     zlo = -1;
01033     zhi = 0;
01034   } else {
01035     zlo = -1;
01036     zhi = 1;
01037   }
01038   
01039   neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
01040   // neighborForcePencils.resize(yNBlocks*zNBlocks);
01041   for (int j=0;j < 2;j++)
01042     homePatchIndexList[j].resize(yNBlocks*zNBlocks);
01043   neighborPatchIndex.resize(yNBlocks*zNBlocks);
01044 
01045   pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
01046   for (int j=0;j < 2;j++) {
01047     pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
01048     for (int z=zlo;z <= zhi;z++) {
01049       for (int y=ylo;y <= yhi;y++) {
01050         int pp = y-ylo + (z-zlo)*yNBlocks;
01051         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01052         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01053         if (y == 0 && z == 0) {
01054           // Primary pencil
01055           pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
01056         } else {
01057           pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
01058         }
01059         pmeAtomStorageAllocatedHere[pp] = true;
01060       }
01061     }
01062   }
01063 
01064   // Create stream for this device
01065   createStream(stream);
01066   streamCreated = true;
01067   pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
01068     deviceID, stream);
01069 
01070 }
01071 
// Returns the CUDA stream created in initialize() (valid only after initialize()).
cudaStream_t ComputePmeCUDADevice::getStream() {
  return stream;
}
01075 
// Returns the CUDA device ID assigned in initialize().
int ComputePmeCUDADevice::getDeviceID() {
  return deviceID;
}
01079 
// Returns the manager proxy stored in initialize().
CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy() {
  return mgrProxy;
}
01083 
01084 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in) {
01085   if (pmePencilType != 3)
01086     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
01087   pmePencilXYZ = pmePencilXYZ_in;
01088 }
01089 
01090 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in) {
01091   if (pmePencilType != 2)
01092     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
01093   pmePencilXY = pmePencilXY_in;
01094 }
01095 
01096 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in) {
01097   if (pmePencilType != 1)
01098     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
01099   pmePencilX = pmePencilX_in;
01100 }
01101 
01102 void ComputePmeCUDADevice::activate_pencils() {
01103   if (pmePencilType == 1) {
01104     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01105     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01106     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01107     pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
01108   } else if (pmePencilType == 2) {
01109     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01110     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01111     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01112     pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
01113   } else if (pmePencilType == 3) {
01114     PmeStartMsg* pmeStartMsg = new PmeStartMsg();
01115     pmeStartMsg->data = pmeRealSpaceCompute->getData();
01116     pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01117     pmePencilXYZ[0].start(pmeStartMsg);
01118   }
01119 }
01120 
01121 void ComputePmeCUDADevice::initializePatches(int numHomePatches_in) {
01122   numHomePatches = numHomePatches_in;
01123   for (int j=0;j < 2;j++)
01124     numPencils[j].resize(numHomePatches);
01125   for (int j=0;j < 2;j++)
01126     plList[j].resize(numHomePatches);
01127   for (int j=0;j < 2;j++)
01128     homePatchForceMsgs[j].resize(numHomePatches);
01129   // for (int j=0;j < 2;j++)
01130   //   numHomeAtoms[j].resize(numHomePatches);
01131   // If we have home patches, register this pencil with the neighbors and with self
01132   if (numHomePatches > 0) {
01133     for (int z=zlo;z <= zhi;z++) {
01134       for (int y=ylo;y <= yhi;y++) {
01135         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01136         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01137         int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01138         mgrProxy[node].registerNeighbor(yt, zt);
01139       }
01140     }
01141   }
01142 }
01143 
// Called (possibly from other nodes) to announce one more neighbor pencil
// that will send atoms to this pencil.
// NOTE: re-uses lock_numHomePatchesMerged to guard the counter.
void ComputePmeCUDADevice::registerNeighbor() {
  CmiLock(lock_numHomePatchesMerged);
  numNeighborsExpected++;
  CmiUnlock(lock_numHomePatchesMerged);
}
01149 
01150 //
01151 // Recevice atoms from patch and file them into pencils
01152 //
void ComputePmeCUDADevice::recvAtoms(PmeAtomMsg *msg) {

  PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
  // Store "virial" and "energy" flags
  doVirial = msg->doVirial;
  doEnergy = msg->doEnergy;
  // Get lattice
  SimParameters *simParams = Node::Object()->simParameters;
  Lattice lattice = simParams->lattice;

  // Primary pencil index
  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
  // p0 = position of the primary pencil within the 3x3 filing grid (set below)
  int p0 = 0;
  // Start index of this patch's atoms within each pencil's storage
  int pencilPatchIndex[9];
  int numStrayAtomsPatch = 0;
  if (pmePencilType == 3) {
    // 3D box => store atoms directly without index
    // NOTE: We don't check for stray atoms here!
    pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
  } else {

    // File atoms into the (up to) 3x3 neighbor pencils
    pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
      pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);

    // Loop through pencils and add atoms to pencil atom lists
    // NOTE: we only store to neighboring pencil if there are atoms to store
    int numAtomsCheck = 0;
    for (int p=0;p < 9;p++) {

      // Decode the 3x3 filing-grid position
      int y = (p % 3);
      int z = (p / 3);

      int pp = y + z*yNBlocks;
      int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
      if (pp == pp0) p0 = p;
      if (pp == pp0 || numAtoms > 0) {
        if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
          NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
        int* index = pmeAtomFilerPtr->getAtomIndex(p);
        pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
        // Number of patches in this storage tells you how many home patches contributed and
        // homePatchIndex (pe) tells you which patch contributed
        numAtomsCheck += numAtoms;
      }
    }

    // Deal with stray atoms (filed into slot 9); they are only reported, not gridded
    numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
    if (numStrayAtomsPatch > 0) {
      int* index = pmeAtomFilerPtr->getAtomIndex(9);
      CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
      for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
        int j = index[i];
        CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
      }
    }

    // Sanity check: every incoming atom must be filed somewhere
    if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
      NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
  }

  // Create storage for home patch forces
  PmeForceMsg *forceMsg;
  if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
    // 3D FFT and compute resides on the same node => use zero-copy forces
    forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
    forceMsg->zeroCopy = true;
  } else {
    forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
    forceMsg->zeroCopy = false;
  }
  forceMsg->numAtoms = msg->numAtoms;
  forceMsg->pe = msg->pe;
  forceMsg->compute = msg->compute;
  forceMsg->numStrayAtoms = numStrayAtomsPatch;

  bool done = false;
  // ----------------------------- lock start ---------------------------
  // Only after writing has finished, we get homePatchIndex
  // This guarantees that for whatever thread that receives "done=true", writing has finished on
  // ALL threads.
  CmiLock(lock_recvAtoms);
  numStrayAtoms += numStrayAtomsPatch;
  // Secure homePatchIndex. All writes after this must be inside lock-region
  int homePatchIndex = numHomePatchesRecv;
  // Store primary pencil first
  plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
  if (pmePencilType != 3) {
    // Go back through neighboring pencils and store "homePatchIndex"
    for (int p=0;p < 9;p++) {

      int y = (p % 3);
      int z = (p / 3);

      int pp = y + z*yNBlocks;
      int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
      if (pp != pp0 && numAtoms > 0) {
        homePatchIndexList[atomI][pp].push_back(homePatchIndex);
        // plList[0...numHomePatches-1] = for each home patch stores the location of pencils that are
        //                                sharing it
        // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
        plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
      }
    }
  }
  homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
  // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
  // Set the number of pencils contributing to this home patch
  numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
  //
  numHomePatchesRecv++;
  if (numHomePatchesRecv == numHomePatches) {
    // Reset counter
    numHomePatchesRecv = 0;
    done = true;
  }
  CmiUnlock(lock_recvAtoms);
  // ----------------------------- lock end  ---------------------------

  // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch

  delete msg;

  if (done) {
    // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
    sendAtomsToNeighbors();
  }
}
01282 
01283 //
01284 // Loop through pencils and send atoms to neighboring nodes
01285 //
void ComputePmeCUDADevice::sendAtomsToNeighbors() {
  // Walk the (up to) 3x3 neighbor window around the primary pencil
  for (int z=zlo;z <= zhi;z++) {
    for (int y=ylo;y <= yhi;y++) {
      // Only send to neighbors, not self
      if (y != 0 || z != 0) {
        // NOTE: Must send atomI -value since this will change in spreadCharge(), which might occur
        // before these sends have been performed
        thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
      }
    }
  }
  // Register primary pencil (counts as one arrival toward numNeighborsExpected)
  registerRecvAtomsFromNeighbor();
}
01300 
// Packs the atoms filed for neighbor offset (y,z) of buffer slot atomIval
// into a message and sends it to the node owning that pencil.
void ComputePmeCUDADevice::sendAtomsToNeighbor(int y, int z, int atomIval) {
  // Pencil index  
  int pp = y-ylo + (z-zlo)*yNBlocks;
  // This neighbor pencil is done, finish it up before accessing it
  pmeAtomStorage[atomIval][pp]->finish();
  // Compute destination neighbor pencil index (yt,zt)
  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
  int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
  CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
  // Copy the atoms into a varsize Charm++ message
  PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
  memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
  msgPencil->numAtoms = numAtoms;
  // Store destination pencil index
  msgPencil->y = yt;
  msgPencil->z = zt;
  // Store source pencil index
  msgPencil->srcY = pencilIndexY;
  msgPencil->srcZ = pencilIndexZ;
  // Store energy and virial flags
  msgPencil->doEnergy = doEnergy;
  msgPencil->doVirial = doVirial;
  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
  mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
}
01326 
// Receives atoms sent by a neighbor pencil and appends them to the primary
// pencil's storage; counts the arrival toward numNeighborsExpected.
void ComputePmeCUDADevice::recvAtomsFromNeighbor(PmeAtomPencilMsg *msg) {
  // Store into primary pencil
  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
  // Compute pencil index relative to primary pencil, un-wrapping the
  // periodic coordinates back into [ylo,yhi] x [zlo,zhi]
  int y = msg->srcY - pencilIndexY;
  if (y < ylo) y += pmeGrid.yBlocks;
  if (y > yhi) y -= pmeGrid.yBlocks;
  int z = msg->srcZ - pencilIndexZ;
  if (z < zlo) z += pmeGrid.zBlocks;
  if (z > zhi) z -= pmeGrid.zBlocks;
  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
    NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
  }
  // Read energy and virial flags
  doEnergy = msg->doEnergy;
  doVirial = msg->doVirial;
  // Pencil index where atoms came from
  int pp = y-ylo + (z-zlo)*yNBlocks;
  // Store atoms and mark down the patch index where these atoms were added
  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);

  delete msg;

  registerRecvAtomsFromNeighbor();
}
01352 
01353 void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor() {
01354   // Primary pencil
01355   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01356 
01357   bool done = false;
01358   // ----------------------------- lock start ---------------------------
01359   CmiLock(lock_numNeighborsRecv);
01360   numNeighborsRecv++;
01361   if (numNeighborsRecv == numNeighborsExpected) {
01362     // Reset counter
01363     numNeighborsRecv = 0;
01364     done = true;
01365   }
01366   CmiUnlock(lock_numNeighborsRecv);
01367   // ----------------------------- lock end  ---------------------------
01368 
01369   if (done) {
01370     // Primary pencil has received all atoms and writing has finished => spread charge
01371     spreadCharge();
01372   }  
01373 }
01374 
// Copies the primary pencil's atoms to the device, launches charge spreading,
// and notifies the PME solver pencil that the charge grid is ready.
void ComputePmeCUDADevice::spreadCharge() {
  // Spread charges in primary pencil
  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
  // Primary pencil is done, finish it up before accessing it
  // (clearing is done in mergeForcesOnPatch)
  pmeAtomStorage[atomI][pp0]->finish();
  // Get the number of atoms and pointer to atoms
  int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
  CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
  // Flip atomI <-> forceI: new atoms fill the other slot while forces for
  // this slot are produced
  std::swap(atomI, forceI);
  // Re-allocate force buffer if needed (1.5x growth factor)
  reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
  // Setup patches and atoms
  SimParameters *simParams = Node::Object()->simParameters;
  Lattice lattice = simParams->lattice;
  pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
  // Spread charge; beforeWalltime starts the timing bracket closed in gatherForce()
  beforeWalltime = CmiWallTimer();
  pmeRealSpaceCompute->spreadCharge(lattice);
  // Send "charge grid ready to PME solver"
  PmeRunMsg *pmeRunMsg = new PmeRunMsg();
  pmeRunMsg->doVirial = doVirial;
  pmeRunMsg->doEnergy = doEnergy;
  pmeRunMsg->lattice = lattice;
  pmeRunMsg->numStrayAtoms = numStrayAtoms;
  // Reset stray atom counter
  numStrayAtoms = 0;
  // Notify the first pencil stage of the active decomposition
  switch(pmePencilType) {
    case 1:
    pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
    break;
    case 2:
    pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
    break;
    case 3:
    pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
    break;
  }
}
01415 
01416 //
01417 // After PME solver is done, we return here
01418 //
void ComputePmeCUDADevice::gatherForce() {
  // Close the spreadCharge timing bracket opened in spreadCharge(),
  // then start the gatherForce bracket (closed in gatherForceDone())
  traceUserBracketEvent(CUDA_EVENT_SPREADCHARGE, beforeWalltime, CmiWallTimer());
  beforeWalltime = CmiWallTimer();
  // gather (i.e. un-grid) forces
  SimParameters *simParams = Node::Object()->simParameters;
  Lattice lattice = simParams->lattice;
  pmeRealSpaceCompute->gatherForce(lattice, force);
  // Set callback that will call gatherForceDone() once gatherForce is done
  ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
  // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
  // gatherForceDone();
}
01431 
01432 static inline void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param) {
01433   ComputePmeCUDADevice* c = (ComputePmeCUDADevice *)param;
01434   c->gatherForceDoneSubset(first, last);
01435 }
01436 
// Marks the primary pencil "done" for home patches [first,last] (inclusive)
// and merges forces for any patch whose pencil count reaches zero.
void ComputePmeCUDADevice::gatherForceDoneSubset(int first, int last) {
  for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
    bool done = false;
    // ----------------------------- lock start ---------------------------
    // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
    //       we really only need one per element, but that would require
    //       numHomePatches number of locks.
    if (pmePencilType != 3) CmiLock(lock_numPencils);
    numPencils[forceI][homePatchIndex]--;
    if (numPencils[forceI][homePatchIndex] == 0) done = true;
    if (pmePencilType != 3) CmiUnlock(lock_numPencils);
    // ----------------------------- lock end  ---------------------------
    if (done) {
      // This home patch is done, launch force merging
      mergeForcesOnPatch(homePatchIndex);
    }
  }
}
01455 
// Called once the device gatherForce has completed: the primary pencil's
// force[] buffer is populated. Distributes forces to neighbor pencils and
// decrements the per-patch pencil counters, merging forces when a patch is done.
void ComputePmeCUDADevice::gatherForceDone() {
  // Primary pencil has the forces

  // Close the gatherForce timing bracket opened in gatherForce()
  traceUserBracketEvent(CUDA_EVENT_GATHERFORCE, beforeWalltime, CmiWallTimer());

  // Send forces to neighbors
  sendForcesToNeighbors();

#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if (useCkLoop >= 1) {
    // Parallelize the per-patch countdown across the node's PEs
    CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
  } else
#endif

  {
    // Loop through home patches and mark the primary pencil as "done"
    for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
      bool done = false;
      // ----------------------------- lock start ---------------------------
      // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
      //       we really only need one per element, but that would require
      //       numHomePatches number of locks.
      if (pmePencilType != 3) CmiLock(lock_numPencils);
      numPencils[forceI][homePatchIndex]--;
      if (numPencils[forceI][homePatchIndex] == 0) done = true;
      if (pmePencilType != 3) CmiUnlock(lock_numPencils);
      // ----------------------------- lock end  ---------------------------
      if (done) {
        // This home patch is done, launch force merging
        thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
      }
    }
  }

  // In case we have no home patches, clear the primary pencil storage here
  if (numHomePatches == 0) {
    int pp0 = 0-ylo + (0-zlo)*yNBlocks;
    pmeAtomStorage[forceI][pp0]->clear();
  }

}
01498 
01499 //
01500 // After gatherForce is done, we end up here
01501 //
// Sends each neighbor pencil its slice of the primary pencil's force[] buffer.
void ComputePmeCUDADevice::sendForcesToNeighbors() {
  // Primary pencil has the forces
  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
  // patchPos[k] = end offset of the k:th stored patch within the force buffer
  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
  // Loop through neighboring pencils
  for (int z=zlo;z <= zhi;z++) {
    for (int y=ylo;y <= yhi;y++) {
      // Only send to neighbors, not self
      if (y != 0 || z != 0) {
        int pp = y-ylo + (z-zlo)*yNBlocks;
        // neighborPatchIndex was recorded when this neighbor's atoms arrived
        int patchIndex = neighborPatchIndex[pp];
        int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
        int atomEnd   = patchPos[patchIndex];
        int natom = atomEnd-atomStart;
        // copy forces
        PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
        msg->numAtoms = natom;
        memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
        // Calculate destination pencil index (dstY, dstZ) for this neighbor
        int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
        int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
        int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
        msg->y = dstY;
        msg->z = dstZ;
        // Store source pencil index
        msg->srcY = pencilIndexY;
        msg->srcZ = pencilIndexZ;
        mgrProxy[node].recvForcesFromNeighbor(msg);
      }
    }
  }
}
01534 
// Receives the force slice computed by a neighbor pencil, stores the message
// for mergeForcesOnPatch, and decrements the pencil counters of every home
// patch that neighbor contributed to.
void ComputePmeCUDADevice::recvForcesFromNeighbor(PmeForcePencilMsg *msg) {

  // Source pencil index, un-wrapped into [ylo,yhi] x [zlo,zhi]
  int y = msg->srcY - pencilIndexY;
  if (y < ylo) y += pmeGrid.yBlocks;
  if (y > yhi) y -= pmeGrid.yBlocks;
  int z = msg->srcZ - pencilIndexZ;
  if (z < zlo) z += pmeGrid.zBlocks;
  if (z > zhi) z -= pmeGrid.zBlocks;

  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
    NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
  }

  // Source pencil
  int pp = y-ylo + (z-zlo)*yNBlocks;

  // Store message (deleted in mergeForcesOnPatch)
  neighborForcePencilMsgs[pp] = msg;

  // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
  // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
  // neighborForcePencils[pp].numAtoms = msg->numAtoms;
  // neighborForcePencils[pp].y = msg->y;
  // neighborForcePencils[pp].z = msg->z;
  // neighborForcePencils[pp].srcY = msg->srcY;
  // neighborForcePencils[pp].srcZ = msg->srcZ;
  // delete msg;

  // numPatches = number of home patches this pencil has
  int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
  if (numPatches != homePatchIndexList[forceI][pp].size()) {
    NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
  }
  for (int i=0;i < numPatches;i++) {
    // this pencil contributed to home patch with index "homePatchIndex"
    int homePatchIndex = homePatchIndexList[forceI][pp][i];
    // ----------------------------- lock start ---------------------------
    // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
    //       we really only need one per element, but that would require
    //       numHomePatches number of locks.
    bool done = false;
    CmiLock(lock_numPencils);
    numPencils[forceI][homePatchIndex]--;
    if (numPencils[forceI][homePatchIndex] == 0) done = true;
    CmiUnlock(lock_numPencils);
    // ----------------------------- lock end  ---------------------------
    if (done) {
      // This home patch is done, launch force merging
      thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
    }
  }

}
01589 
void ComputePmeCUDADevice::mergeForcesOnPatch(int homePatchIndex) {
  // We have all the forces for this patch => merge on a single Pe
  //
  // Gathers the force contributions for home patch "homePatchIndex" from the
  // primary pencil (and, for 1D/2D pencil decompositions, from the
  // neighboring pencils) into the per-patch force message, then sends the
  // message towards the Pe that originally submitted the atoms. Invoked once
  // per home patch, after the per-patch pencil counter reaches zero in
  // recvForcesFromNeighbor.

  // Flattened (y,z) index of the primary pencil (offsets y=0, z=0); same
  // indexing formula as used in recvForcesFromNeighbor.
  int pp0 = 0-ylo + (0-zlo)*yNBlocks;

  // Message that goes out to the compute
  PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];

  if (pmePencilType == 3) {
    // 3D box => simple memory copy will do
    // Location of forces in the force[] array.
    // patchPos[] is cumulative: patch k occupies [patchPos[k-1], patchPos[k]).
    int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
    // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
    int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
    int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
    int atomEnd   = patchPos[pencilPatchIndex];
    int numAtoms = atomEnd-atomStart;
    if (forceMsg->zeroCopy) {
      // Zero-copy, just pass the pointer (message aliases the force[] array;
      // no per-atom copy needed in the single-pencil case)
      forceMsg->force = force+atomStart;
    } else {
      memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
    }
  } else {

    // Zero force array
    // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
    memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));

    // Store forces from primary pencil
    {
      int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
      int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
      int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
      int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
      int atomEnd   = patchPos[pencilPatchIndex];
      int numAtoms = atomEnd-atomStart;

      // Copy in local forces that are stored in the force[] array,
      // scattering each one to its patch-local slot given by index[]
      for (int i=0;i < numAtoms;i++) {
        forceMsg->force[index[atomStart + i]] = force[atomStart + i];
      }

    }

    // Add forces from neighboring pencils (entries j >= 1 of plList)
    for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
      int pp               = plList[forceI][homePatchIndex][j].pp;
      int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;

      int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
      int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
      int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
      int atomEnd   = patchPos[pencilPatchIndex];
      int numAtoms = atomEnd-atomStart;
      CudaForce *dstForce = forceMsg->force;
      // CudaForce *srcForce = neighborForcePencils[pp].force;
      // Neighbor forces live in the message stored by recvForcesFromNeighbor
      CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;

      // Accumulate (+=) rather than assign: several pencils may contribute
      // to the same atom of this patch
      for (int i=0;i < numAtoms;i++) {
        dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
        dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
        dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
      }

    }
  }

  // Clear storage
  plList[forceI][homePatchIndex].clear();

  // ----------------------------- lock start ---------------------------
  // bool done = false;
  // Node-wide lock protects the shared numHomePatchesMerged counter. The
  // caller that merges the LAST home patch also frees the stored neighbor
  // force messages and clears the per-pencil bookkeeping for this phase.
  CmiLock(lock_numHomePatchesMerged);
  numHomePatchesMerged++;
  if (numHomePatchesMerged == numHomePatches) {
    // Reset counter
    numHomePatchesMerged = 0;

    // Delete messages (stored in recvForcesFromNeighbor)
    for (int i=0;i < neighborForcePencilMsgs.size();i++) {
      if (neighborForcePencilMsgs[i] != NULL) {
        delete neighborForcePencilMsgs[i];
        neighborForcePencilMsgs[i] = NULL;
      }
    }

    // Done merging and sending forces => clear storage
    for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
      homePatchIndexList[forceI][pp].clear();
    for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
      pmeAtomStorage[forceI][pp]->clear();

  }
  CmiUnlock(lock_numHomePatchesMerged);
  // ----------------------------- lock end  ---------------------------

  // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
  // that node will then relay the message to the Pe that originally sent the atoms
  int pe = forceMsg->pe;
  if (CkNodeOf(pe) != CkMyNode())
    thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
  else
    sendForcesToPatch(forceMsg);

}
01696 
01697 void ComputePmeCUDADevice::sendForcesToPatch(PmeForceMsg *forceMsg) {
01698   // Now we're on the node that has Pe, hence "compute" -pointer is valid
01699   int pe                  = forceMsg->pe;
01700   ComputePmeCUDA *compute = forceMsg->compute;
01701 
01702   // Store message for use in ComputePmeCUDA, where it'll also be deleted.
01703   if (compute->storePmeForceMsg(forceMsg)) {
01704     // Enqueue on the pe that sent the atoms in the first place
01705     LocalWorkMsg *lmsg = compute->localWorkMsg;
01706     CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
01707     wdProxy[pe].enqueuePme(lmsg);
01708   }
01709 }
01710 
01711 //
01712 // Received self energy from ComputePmeCUDA computes
01713 // NOTE: This should only happen once at the start of the simulation since self energy is constant
01714 //
01715 void ComputePmeCUDAMgr::recvSelfEnergy(PmeSelfEnergyMsg *msg) {
01716   // NOTE: msg is deleted in PmePencilXYZ / PmePencilX
01717   switch(pmePencilType) {
01718     case 1:
01719     pmePencilZ(0,0,0).recvSelfEnergy(msg);
01720     break;
01721     case 2:
01722     pmePencilZ(0,0,0).recvSelfEnergy(msg);
01723     break;
01724     case 3:
01725     pmePencilXYZ[0].recvSelfEnergy(msg);
01726     break;
01727   }  
01728 }
01729 #endif // NAMD_CUDA
01730 
01731 #include "ComputePmeCUDAMgr.def.h"

Generated on Tue Sep 26 01:17:12 2017 for NAMD by  doxygen 1.4.7