ComputePmeCUDAMgr.C

Go to the documentation of this file.
00001 #include <vector>
00002 #include <numeric>
00003 #include <algorithm>
00004 #include "Node.h"
00005 #include "PatchMap.h"
00006 #include "WorkDistrib.h"
00007 #include "Priorities.h"
00008 
00009 #include "CudaUtils.h"
00010 
00011 #include "SimParameters.h"
00012 #include "CudaPmeSolverUtil.h"
00013 
00014 #include "ComputePmeCUDAMgr.h"
00015 
00016 #include "CudaPmeSolver.h"
00017 #include "ComputePmeCUDA.h"
00018 
00019 #include "DeviceCUDA.h"
00020 #ifdef NAMD_CUDA
00021 #ifdef WIN32
00022 #define __thread __declspec(thread)
00023 #endif
00024 extern __thread DeviceCUDA *deviceCUDA;
00025 
00026 void createStream(cudaStream_t& stream) {
00027 #if CUDA_VERSION >= 5050
00028   int leastPriority, greatestPriority;
00029   cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00030   cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority));
00031   // cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault,leastPriority));
00032 #else
00033   cudaCheck(cudaStreamCreate(&stream));
00034 #endif
00035 }
00036 
00037 //
00038 // CUDA implementation of atom storage
00039 //
00040 class CudaPmeAtomStorage : public PmeAtomStorage {
00041 public:
00042   CudaPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
00043   ~CudaPmeAtomStorage() {
00044     if (atom != NULL) dealloc_((void *)atom);
00045     if (atomIndex != NULL) dealloc_((void *)atomIndex);
00046     if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
00047     if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
00048   }
00049 private:
00050   
00051   // Allocate array of size bytes
00052   void* alloc_(const int size) {
00053     void* p;
00054     cudaCheck(cudaMallocHost(&p, size));
00055     return p;
00056   }
00057 
00058   // Deallocate array
00059   void dealloc_(void *p) {
00060     cudaCheck(cudaFreeHost(p));
00061   }
00062 
00063 };
00064 
00065 //
00066 // CPU implementation of atom storage
00067 //
00068 class CpuPmeAtomStorage : public PmeAtomStorage {
00069 public:
00070   CpuPmeAtomStorage(const bool useIndex) : PmeAtomStorage(useIndex) {}
00071   ~CpuPmeAtomStorage() {
00072     if (atom != NULL) dealloc_((void *)atom);
00073     if (atomIndex != NULL) dealloc_((void *)atomIndex);
00074     if (overflowAtom != NULL) dealloc_((void *)overflowAtom);
00075     if (overflowAtomIndex != NULL) dealloc_((void *)overflowAtomIndex);
00076   }
00077 private:
00078   
00079   // Allocate array of size bytes
00080   void* alloc_(const int size) {
00081     return (void *)(new char[size]);
00082   }
00083 
00084   // Deallocate array
00085   void dealloc_(void *p) {
00086     delete [] (char *)p;
00087   }
00088 
00089 };
00090 
00091 PmeAtomFiler::PmeAtomFiler() {
00092   for (int p=0;p < 10;p++) {
00093     pencil[p] = NULL;
00094     pencilCapacity[p] = 0;
00095   }  
00096 }
00097 PmeAtomFiler::PmeAtomFiler(CkMigrateMessage *m) {
00098   for (int p=0;p < 10;p++) {
00099     pencil[p] = NULL;
00100     pencilCapacity[p] = 0;
00101   }
00102 }
00103 PmeAtomFiler::~PmeAtomFiler() {
00104   for (int p=0;p < 10;p++) {
00105     if (pencil[p] != NULL) delete [] pencil[p];
00106   }
00107 }
00108 
00109 //
00110 // File atoms into PME pencils. Each atom can belong to maximum 9 pencils
00111 // (oy, oz) = origin of the atoms
00112 // (miny, minz) = grid minimum corner for this patch
00113 // NOTE: This method can only be called locally from the same Pe
00114 //
00115 void PmeAtomFiler::fileAtoms(const int numAtoms, const CudaAtom* atoms, Lattice &lattice, const PmeGrid &pmeGrid,
00116   const int pencilIndexY, const int pencilIndexZ, const int ylo, const int yhi, const int zlo, const int zhi) {
00117 
00118   // Make sure there's enough room in the pencil arrays
00119   for (int p=0;p < 10;p++) {
00120     if (pencil[p] != NULL && pencilCapacity[p] < numAtoms) {
00121       delete [] pencil[p];
00122       pencil[p] = NULL;
00123       pencilCapacity[p] = 0;
00124     }
00125     if (pencil[p] == NULL) {
00126       int newSize = (int)(numAtoms*1.5);
00127       pencil[p] = new int[newSize];
00128       pencilCapacity[p] = newSize;
00129     }
00130     pencilSize[p] = 0;
00131   }
00132 
00133   const float recip11 = lattice.a_r().x;
00134   const float recip22 = lattice.b_r().y;
00135   const float recip33 = lattice.c_r().z;
00136   const int order1 = pmeGrid.order - 1;
00137   const int K1 = pmeGrid.K1;
00138   const int K2 = pmeGrid.K2;
00139   const int K3 = pmeGrid.K3;
00140   const int yBlocks = pmeGrid.yBlocks;
00141   const int zBlocks = pmeGrid.zBlocks;
00142 
00143   for (int i=0;i < numAtoms;i++) {
00144     float frx, fry, frz;
00145     // PmeRealSpaceCompute::calcGridCoord(atoms[i].uix, atoms[i].uiy, atoms[i].uiz,
00146     //   K1, K2, K3, frx, fry, frz);
00147     PmeRealSpaceCompute::calcGridCoord(atoms[i].x, atoms[i].y, atoms[i].z, K1, K2, K3, frx, fry, frz);
00148     // Charge is spread in the region [y0 ... y0+order-1] x [z0 ... z0+order-1]
00149     int y0 = (int)fry;
00150     int z0 = (int)frz;
00151     if (y0 < 0 || y0 >= K2 || z0 < 0 || z0 >= K3) {
00152       // Add to "Stray pencil" and skip to next atom
00153       pencil[9][pencilSize[9]++] = i;
00154       continue;
00155       // fprintf(stderr, "%lf %lf %lf\n", atoms[i].x, atoms[i].y, atoms[i].z);
00156       // NAMD_bug("PmeAtomFiler::fileAtoms, charge out of bounds");
00157     }
00158     // Calculate pencil index for the four corners of the order X order area
00159     // The corners determine the pencil indices for this atom.
00160     int occupied = 0;
00161     int plist[4];
00162 #pragma unroll
00163     for (int j=0;j < 4;j++) {
00164 
00165       int py = getPencilIndexY(pmeGrid, (y0 + (j%2)*order1) % K2) - pencilIndexY;
00166       if (py < ylo) py += yBlocks;
00167       if (py > yhi) py -= yBlocks;
00168 
00169       int pz = getPencilIndexZ(pmeGrid, (z0 + (j/2)*order1) % K3) - pencilIndexZ;
00170       if (pz < zlo) pz += zBlocks;
00171       if (pz > zhi) pz -= zBlocks;
00172 
00173       if (py < ylo || py > yhi || pz < zlo || pz > zhi) {
00174         // Add to "Stray pencil" and skip to next atom
00175         pencil[9][pencilSize[9]++] = i;
00176         goto breakjloop;
00177         // fprintf(stderr, "py %d [%d ... %d] pz %d [%d ... %d]\n", pz, zlo, zhi);
00178         // NAMD_bug("PmeAtomFiler::fileAtoms, py,pz outside bounds");
00179       }
00180       // p = 0,1,2,3,4,5,6,7,8 (maximum range)
00181       plist[j] = (py-ylo) + (pz-zlo)*3;
00182     }
00183 
00184 #pragma unroll
00185     for (int j=0;j < 4;j++) {
00186       int p = plist[j];
00187       // pbit = 0, 2, 4, 8, 16, 32, 64, 128, 256
00188       int pbit = (1 << p);
00189       if (!(occupied & pbit)) {
00190         pencil[p][pencilSize[p]++] = i;
00191         occupied |= pbit;
00192       }
00193     }
00194 
00195 breakjloop:
00196     continue;
00197   }
00198 
00199 }
00200 
00201 //
00202 // Class constructor
00203 //
00204 ComputePmeCUDAMgr::ComputePmeCUDAMgr() {
00205   __sdag_init();
00206   numDevices = 0;
00207   numTotalPatches = 0;
00208   numNodesContributed = 0;
00209   numDevicesMax = 0;
00210 }
00211 
00212 //
00213 // Class constructor
00214 //
00215 ComputePmeCUDAMgr::ComputePmeCUDAMgr(CkMigrateMessage *) {
00216   __sdag_init();
00217   NAMD_bug("ComputePmeCUDAMgr cannot be migrated");
00218   numDevices = 0;
00219   numTotalPatches = 0;
00220   numNodesContributed = 0;
00221   numDevicesMax = 0;
00222 }
00223 
00224 //
00225 // Class destructor
00226 //
00227 ComputePmeCUDAMgr::~ComputePmeCUDAMgr() {
00228   for (int i=0;i < extraDevices.size();i++) {
00229     cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
00230     cudaCheck(cudaStreamDestroy(extraDevices[i].stream));
00231   }
00232 }
00233 
00234 //
00235 // Returns home pencil (homey, homez)
00236 // Home pencil = pencil with most overlap with this patch
00237 //
00238 void ComputePmeCUDAMgr::getHomePencil(PatchID patchID, int& homey, int& homez) {
00239   PatchMap *patchMap = PatchMap::Object();
00240 
00241   BigReal miny = patchMap->min_b(patchID);
00242   BigReal maxy = patchMap->max_b(patchID);
00243 
00244   BigReal minz = patchMap->min_c(patchID);
00245   BigReal maxz = patchMap->max_c(patchID);
00246 
00247   // Determine home pencil = pencil with most overlap
00248   
00249   // Calculate patch grid coordinates
00250   int patch_y0 = floor((miny+0.5)*pmeGrid.K2);
00251   int patch_y1 = floor((maxy+0.5)*pmeGrid.K2)-1;
00252   int patch_z0 = floor((minz+0.5)*pmeGrid.K3);
00253   int patch_z1 = floor((maxz+0.5)*pmeGrid.K3)-1;
00254 
00255   if (patch_y0 < 0 || patch_y1 >= pmeGrid.K2 || patch_z0 < 0 || patch_z1 >= pmeGrid.K3) {
00256     NAMD_bug("ComputePmeCUDAMgr::getHomePencil, patch bounds are outside grid bounds");
00257   }
00258 
00259   int maxOverlap = 0;
00260   homey = -1;
00261   homez = -1;
00262   for (int iz=0;iz < pmeGrid.zBlocks;iz++) {
00263     for (int iy=0;iy < pmeGrid.yBlocks;iy++) {
00264       int pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1;
00265       getPencilDim(pmeGrid, Perm_X_Y_Z, iy, iz,
00266         pencil_x0, pencil_x1, pencil_y0, pencil_y1, pencil_z0, pencil_z1);
00267 
00268       if (pencil_y1 - pencil_y0 < pmeGrid.order || pencil_z1 - pencil_z0 < pmeGrid.order)
00269         NAMD_bug("ComputePmeCUDAMgr::getHomePencil, pencil size must be >= PMEInterpOrder");
00270 
00271       int y0 = std::max(patch_y0, pencil_y0);
00272       int y1 = std::min(patch_y1, pencil_y1);
00273       int z0 = std::max(patch_z0, pencil_z0);
00274       int z1 = std::min(patch_z1, pencil_z1);
00275 
00276       int overlap = (y1-y0 > 0 && z1-z0 > 0) ? (y1-y0)*(z1-z0) : -1;
00277 
00278       if (overlap > maxOverlap) {
00279         maxOverlap = overlap;
00280         homey = iy;
00281         homez = iz;
00282       }
00283     }
00284   }
00285 
00286   if (homey == -1 || homez == -1)
00287     NAMD_bug("ComputePmeCUDAMgr::getHomePencil, home pencil not found");
00288 }
00289 
00290 //
00291 // Calculates maximum number of PME pencils
00292 //
00293 void ComputePmeCUDAMgr::restrictToMaxPMEPencils() {
00294   PatchMap *patchMap = PatchMap::Object();
00295   SimParameters *simParams = Node::Object()->simParameters;
00296   Lattice lattice = simParams->lattice;
00297   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
00298   BigReal sysdimc = lattice.c_r().unit() * lattice.c();
00299   BigReal cutoff = simParams->cutoff;
00300   BigReal patchdim = simParams->patchDimension;
00301   BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
00302   BigReal marginc = 0.5 * ( patchdim - cutoff ) / sysdimc;
00303   int numPatches = patchMap->numPatches();
00304 
00305   pmeGrid.xBlocks = std::min(pmeGrid.xBlocks, pmeGrid.K1);
00306 
00307   int pid = 0;
00308   while (pid < numPatches) {
00309     // Get home pencil
00310     int homey, homez;
00311     getHomePencil(pid, homey, homez);
00312     // Y
00313     {
00314       BigReal miny = patchMap->min_b(pid);
00315       BigReal maxy = patchMap->max_b(pid);
00316       // min2 (max2) is smallest (largest) grid line for this patch
00317       int min2 = ((int) floor(pmeGrid.K2 * (miny+0.5 - marginb)));
00318       int max2 = ((int) floor(pmeGrid.K2 * (maxy+0.5 + marginb))) + (pmeGrid.order - 1);
00319       // Restrict grid lines to [0 ... pmeGrid.K2-1]
00320       if (min2 < 0) min2 += pmeGrid.K2;
00321       if (max2 >= pmeGrid.K2) max2 -= pmeGrid.K2;
00322       // Get pencil indices for the grid lines
00323       int min2pi = getPencilIndexY(pmeGrid, min2);
00324       int max2pi = getPencilIndexY(pmeGrid, max2);
00325       // Distance from home pencil
00326       int dmin2pi = homey - min2pi;
00327       if (dmin2pi < 0) dmin2pi += pmeGrid.yBlocks;
00328       if (dmin2pi < 0)
00329         NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin2pi");
00330       // If distance is > 1, must decrease the number of y-pencils and try again
00331       if (dmin2pi > 1) {
00332         pmeGrid.yBlocks--;
00333         if (pmeGrid.yBlocks <= 0) break;
00334         continue;
00335       }
00336       int dmax2pi = max2pi - homey;
00337       if (dmax2pi < 0) dmax2pi += pmeGrid.yBlocks;
00338       if (dmax2pi < 0)
00339         NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax2pi");
00340       // If distance is > 1, must decrease the number of y-pencils and try again
00341       if (dmax2pi > 1) {
00342         pmeGrid.yBlocks--;
00343         if (pmeGrid.yBlocks <= 0) break;
00344         continue;
00345       }
00346     }
00347 
00348     // Z
00349     {
00350       BigReal minz = patchMap->min_c(pid);
00351       BigReal maxz = patchMap->max_c(pid);
00352       // min3 (max3) is smallest (largest) grid line for this patch
00353       int min3 = ((int) floor(pmeGrid.K3 * (minz+0.5 - marginc)));
00354       int max3 = ((int) floor(pmeGrid.K3 * (maxz+0.5 + marginc))) + (pmeGrid.order - 1);
00355       // Restrict grid lines to [0 ... pmeGrid.K3-1]
00356       if (min3 < 0) min3 += pmeGrid.K3;
00357       if (max3 >= pmeGrid.K3) max3 -= pmeGrid.K3;
00358       // Get pencil indices for the grid lines
00359       int min3pi = getPencilIndexZ(pmeGrid, min3);
00360       int max3pi = getPencilIndexZ(pmeGrid, max3);
00361       // Distance from home pencil
00362       int dmin3pi = homez - min3pi;
00363       if (dmin3pi < 0) dmin3pi += pmeGrid.zBlocks;
00364       if (dmin3pi < 0)
00365         NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmin3pi");
00366       // If distance is > 1, must decrease the number of z-pencils and try again
00367       if (dmin3pi > 1) {
00368         pmeGrid.zBlocks--;
00369         if (pmeGrid.zBlocks <= 0) break;
00370         continue;
00371       }
00372       int dmax3pi = max3pi - homez;
00373       if (dmax3pi < 0) dmax3pi += pmeGrid.zBlocks;
00374       if (dmax3pi < 0)
00375         NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, Error in dmax3pi");
00376       // If distance is > 1, must decrease the number of z-pencils and try again
00377       if (dmax3pi > 1) {
00378         pmeGrid.zBlocks--;
00379         if (pmeGrid.zBlocks <= 0) break;
00380         continue;
00381       }
00382     }
00383 
00384     pid++;
00385   }
00386 
00387   // if (CkMyNode() == 0)
00388   //   fprintf(stderr, "pmeGrid.yBlocks %d pmeGrid.zBlocks %d\n", pmeGrid.yBlocks, pmeGrid.zBlocks);
00389 
00390   if (pmeGrid.xBlocks <= 0 || pmeGrid.yBlocks <= 0 || pmeGrid.zBlocks <= 0)
00391     NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, zero PME pencils found");
00392 
00393   if (pmeGrid.xBlocks > pmeGrid.K1 || pmeGrid.yBlocks > pmeGrid.K2|| pmeGrid.zBlocks > pmeGrid.K3)
00394     NAMD_bug("ComputePmeCUDAMgr::restrictToMaxPMEPencils, unable to restrict number of PME pencils");
00395 }
00396 
00397 //
00398 // Sets up pencils. May be called multiple times. 
00399 //
00400 void ComputePmeCUDAMgr::setupPencils() {
00401   SimParameters *simParams = Node::Object()->simParameters;
00402 
00403   pmeGrid.K1 = simParams->PMEGridSizeX;
00404   pmeGrid.K2 = simParams->PMEGridSizeY;
00405   pmeGrid.K3 = simParams->PMEGridSizeZ;
00406   pmeGrid.order = simParams->PMEInterpOrder;
00407   pmeGrid.dim2 = pmeGrid.K2;
00408   pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
00409 
00410   // Count the total number of devices assuming all nodes have the same number as this node
00411   // NOTE: This should be changed in the future to support heterogeneous nodes!!!
00412   int numDevicesTmp = deviceCUDA->getNumDevice();
00413 
00414   int numDeviceTot = CkNumNodes() * numDevicesTmp;
00415   // Use approximately 1/4th of the devices for PME
00416   int numDeviceToUse = std::max(1, numDeviceTot/4);
00417 
00418   if (numDeviceToUse < 4) {
00419     // 2D Slab
00420     pmeGrid.yBlocks = 1;
00421     pmeGrid.xBlocks = pmeGrid.zBlocks = numDeviceToUse;
00422   } else {
00423     // 1D Pencil
00424     pmeGrid.yBlocks = (int)sqrt((double)numDeviceToUse);
00425     pmeGrid.zBlocks = numDeviceToUse/pmeGrid.yBlocks;
00426     pmeGrid.xBlocks = pmeGrid.zBlocks;
00427   }
00428 
00429   if ( simParams->PMEPencilsX > 0 ) pmeGrid.xBlocks = simParams->PMEPencilsX;
00430   if ( simParams->PMEPencilsY > 0 ) pmeGrid.yBlocks = simParams->PMEPencilsY;
00431   if ( simParams->PMEPencilsZ > 0 ) pmeGrid.zBlocks = simParams->PMEPencilsZ;
00432 
00433   // Restrict number of pencils to the maximum number
00434   restrictToMaxPMEPencils();
00435 
00436   // Fix pencil numbers if they don't make sense w.r.t. number of devices
00437   if (pmeGrid.yBlocks == 1) {
00438     // 2D Slab
00439     if (pmeGrid.xBlocks > numDeviceTot) pmeGrid.xBlocks = numDeviceTot;
00440     if (pmeGrid.zBlocks > numDeviceTot) pmeGrid.zBlocks = numDeviceTot;
00441   } else {
00442     // 1D Pencil
00443     if (pmeGrid.yBlocks*pmeGrid.zBlocks > numDeviceTot ||
00444       pmeGrid.xBlocks*pmeGrid.zBlocks > numDeviceTot ||
00445       pmeGrid.xBlocks*pmeGrid.yBlocks > numDeviceTot) {
00446       pmeGrid.yBlocks = std::min(pmeGrid.yBlocks, (int)sqrt((double)numDeviceTot));
00447       pmeGrid.zBlocks = std::min(pmeGrid.zBlocks, numDeviceTot/pmeGrid.yBlocks);
00448     }
00449     pmeGrid.xBlocks = std::min(pmeGrid.yBlocks, pmeGrid.zBlocks);
00450   }
00451 
00452   // Here (block1, block2, block3) define the size of charge grid pencil in each direction
00453   pmeGrid.block1 = ( pmeGrid.K1 + pmeGrid.xBlocks - 1 ) / pmeGrid.xBlocks;
00454   pmeGrid.block2 = ( pmeGrid.K2 + pmeGrid.yBlocks - 1 ) / pmeGrid.yBlocks;
00455   pmeGrid.block3 = ( pmeGrid.K3 + pmeGrid.zBlocks - 1 ) / pmeGrid.zBlocks;
00456 
00457   // Determine type of FFT
00458   if (pmeGrid.xBlocks == 1 && pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1) {
00459     // Single block => 3D FFT
00460     pmePencilType = 3;
00461   } else if (pmeGrid.yBlocks == 1) {
00462     // Blocks in all but y-dimension => 2D FFT
00463     pmePencilType = 2;
00464   } else {
00465     // Blocks in all dimensions => 1D FFT
00466     pmePencilType = 1;
00467   }
00468 
00469   //--------------------------------------------------------------------------
00470   // Map pencils into Pes
00471   xPes.resize(pmeGrid.yBlocks*pmeGrid.zBlocks);
00472 
00473   if (pmePencilType == 1 || pmePencilType == 2) {
00474     zPes.resize(pmeGrid.xBlocks*pmeGrid.yBlocks);
00475   }
00476   if (pmePencilType == 1) {
00477     yPes.resize(pmeGrid.xBlocks*pmeGrid.zBlocks);
00478   }  
00479 
00480     // i % numDeviceTot                              = device index
00481     // (i % numDeviceTot)/deviceCUDA->getNumDevice() = node index
00482     // (i % CkNodeSize(node))                        = pe displacement
00483   for (int i=0;i < xPes.size();i++) {
00484     int node = (i % numDeviceTot)/numDevicesTmp;
00485     xPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
00486   }
00487   for (int i=0;i < yPes.size();i++) {
00488     int node = (i % numDeviceTot)/numDevicesTmp;
00489     yPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
00490   }
00491   for (int i=0;i < zPes.size();i++) {
00492     int node = (i % numDeviceTot)/numDevicesTmp;
00493     zPes[i] = CkNodeFirst(node) + (i + 0) % CkNodeSize(node);
00494   }
00495 
00496   // char peStr[256];
00497   // char *p = peStr;
00498   // p += sprintf(p, "%2d | xPes", CkMyPe());
00499   // for (int i=0;i < xPes.size();i++)
00500   //   p += sprintf(p, " %d", xPes[i]);
00501   // p += sprintf(p, " yPes");
00502   // for (int i=0;i < yPes.size();i++)
00503   //   p += sprintf(p, " %d", yPes[i]);
00504   // p += sprintf(p, " zPes");
00505   // for (int i=0;i < zPes.size();i++)
00506   //   p += sprintf(p, " %d", zPes[i]);
00507   // fprintf(stderr, "%s | %d %d\n",peStr, CkNodeFirst(CkMyNode()), CkNodeSize(CkMyNode()));
00508 
00509   //--------------------------------------------------------------------------
00510   // Build global node list for x-pencils
00511   nodeDeviceList.resize(xPes.size());
00512   numDevices = 0;
00513   for (int k=0;k < xPes.size();k++) {
00514     nodeDeviceList[k].node = CkNodeOf(xPes[k]);
00515     nodeDeviceList[k].device = -1;
00516     if (nodeDeviceList[k].node == CkMyNode()) {
00517       nodeDeviceList[k].device = numDevices++;
00518     }
00519   }
00520 
00521   ijPencilX.clear();
00522   ijPencilY.clear();
00523   ijPencilZ.clear();
00524 
00525   // Construct list of pencil coordinates (i,j) for each device held by this node
00526   for (int k=0;k < xPes.size();k++) {
00527     if (CkMyNode() == CkNodeOf(xPes[k])) {
00528       IJ ij;
00529       ij.i = k % pmeGrid.yBlocks;
00530       ij.j = k / pmeGrid.yBlocks;
00531       ijPencilX.push_back(ij);
00532     }
00533   }
00534   if (ijPencilX.size() != numDevices)
00535     NAMD_bug("ComputePmeCUDAMgr::setupPencils, error setting up x-pencils and devices");
00536 
00537   int numDevicesY = 0;
00538   for (int k=0;k < yPes.size();k++) {
00539     if (CkMyNode() == CkNodeOf(yPes[k])) {
00540       IJ ij;
00541       ij.i = k % pmeGrid.xBlocks;
00542       ij.j = k / pmeGrid.xBlocks;
00543       ijPencilY.push_back(ij);
00544       numDevicesY++;
00545     }
00546   }
00547 
00548   int numDevicesZ = 0;
00549   for (int k=0;k < zPes.size();k++) {
00550     if (CkMyNode() == CkNodeOf(zPes[k])) {
00551       IJ ij;
00552       ij.i = k % pmeGrid.xBlocks;
00553       ij.j = k / pmeGrid.xBlocks;
00554       ijPencilZ.push_back(ij);
00555       numDevicesZ++;
00556     }
00557   }  
00558 }
00559 
00560 //
00561 // Returns true if PE "pe" is used in PME
00562 //
00563 bool ComputePmeCUDAMgr::isPmePe(int pe) {
00564   for (int i=0;i < xPes.size();i++) {
00565     if (pe == xPes[i]) return true;
00566   }
00567   return false;
00568 }
00569 
00570 //
00571 // Returns true if node "node" is used for PME
00572 //
00573 bool ComputePmeCUDAMgr::isPmeNode(int node) {
00574   for (int i=0;i < nodeDeviceList.size();i++) {
00575     if (nodeDeviceList[i].node == node) {
00576       return true;
00577     }
00578   }
00579   return false;
00580 }
00581 
00582 //
00583 // Returns true if device "deviceID" is used for PME
00584 //
00585 bool ComputePmeCUDAMgr::isPmeDevice(int deviceID) {
00586   for (int i=0;i < nodeDeviceList.size();i++) {
00587     if (deviceCUDA->getDeviceIDbyRank(nodeDeviceList[i].device % deviceCUDA->getNumDevice()) == deviceID) {
00588       return true;
00589     }
00590   }
00591   return false;
00592 }
00593 
00594 //
00595 // Initialize compute manager
00596 // This gets called on one Pe on each node from Node::startup()
00597 //
00598 void ComputePmeCUDAMgr::initialize(CkQdMsg *msg) {
00599         if (msg != NULL) delete msg;
00600 
00601   setupPencils();
00602 
00603   if ( ! CkMyNode() ) {
00604     iout << iINFO << "PME using " << pmeGrid.xBlocks << " x " <<
00605       pmeGrid.yBlocks << " x " << pmeGrid.zBlocks <<
00606       " pencil grid for FFT and reciprocal sum.\n" << endi;
00607   }
00608 
00609   // Initialize list that contains the number of home patches for each device on this manager
00610   numHomePatchesList.resize(numDevices, 0);
00611 
00612   //--------------------------------------------------------------------------
00613   // Create devices and atom filer
00614   // numDevices = number of devices we'll be using, possibly different on each node
00615   // Use root node to compute the maximum number of devices in use over all nodes
00616   thisProxy[0].initializeDevicesAndAtomFiler(new NumDevicesMsg(numDevices));
00617   //--------------------------------------------------------------------------
00618 
00619   if (CkMyNode() == 0) {
00620 
00621     if (pmePencilType == 3) {
00622                 // Single block => 3D FFT
00623       CProxy_PmePencilXYZMap xyzMap = CProxy_PmePencilXYZMap::ckNew(xPes[0]);
00624       CkArrayOptions xyzOpts(1);
00625       xyzOpts.setMap(xyzMap);
00626       xyzOpts.setAnytimeMigration(false);
00627       xyzOpts.setStaticInsertion(true);
00628       pmePencilXYZ = CProxy_CudaPmePencilXYZ::ckNew(xyzOpts);
00629       pmePencilXYZ[0].initialize(new CudaPmeXYZInitMsg(pmeGrid));
00630       thisProxy.recvPencils(pmePencilXYZ);
00631     } else if (pmePencilType == 2) {
00632       // Blocks in all but y-dimension => 2D FFT
00633       CProxy_PmePencilXYMap xyMap = CProxy_PmePencilXYMap::ckNew(xPes);
00634       CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
00635       CkArrayOptions xyOpts(1, 1, pmeGrid.zBlocks);
00636       CkArrayOptions zOpts(pmeGrid.xBlocks, 1, 1);
00637       xyOpts.setMap(xyMap);
00638       zOpts.setMap(zMap);
00639       xyOpts.setAnytimeMigration(false);
00640       zOpts.setAnytimeMigration(false);
00641       xyOpts.setStaticInsertion(true);
00642       zOpts.setStaticInsertion(true);
00643       pmePencilXY = CProxy_CudaPmePencilXY::ckNew(xyOpts);
00644       pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
00645       // Send pencil proxies to other nodes
00646       thisProxy.recvPencils(pmePencilXY, pmePencilZ);
00647       pmePencilXY.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
00648       pmePencilZ.initialize(new CudaPmeXYInitMsg(pmeGrid, pmePencilXY, pmePencilZ, xyMap, zMap));
00649     } else {
00650                 // Blocks in all dimensions => 1D FFT
00651       CProxy_PmePencilXMap xMap = CProxy_PmePencilXMap::ckNew(1, 2, pmeGrid.yBlocks, xPes);
00652       CProxy_PmePencilXMap yMap = CProxy_PmePencilXMap::ckNew(0, 2, pmeGrid.xBlocks, yPes);
00653       CProxy_PmePencilXMap zMap = CProxy_PmePencilXMap::ckNew(0, 1, pmeGrid.xBlocks, zPes);
00654       CkArrayOptions xOpts(1, pmeGrid.yBlocks, pmeGrid.zBlocks);
00655       CkArrayOptions yOpts(pmeGrid.xBlocks, 1, pmeGrid.zBlocks);
00656                 CkArrayOptions zOpts(pmeGrid.xBlocks, pmeGrid.yBlocks, 1);
00657       xOpts.setMap(xMap);
00658       yOpts.setMap(yMap);
00659       zOpts.setMap(zMap);
00660       xOpts.setAnytimeMigration(false);
00661       yOpts.setAnytimeMigration(false);
00662       zOpts.setAnytimeMigration(false);
00663       xOpts.setStaticInsertion(true);
00664       yOpts.setStaticInsertion(true);
00665       zOpts.setStaticInsertion(true);
00666       pmePencilX = CProxy_CudaPmePencilX::ckNew(xOpts);
00667       pmePencilY = CProxy_CudaPmePencilY::ckNew(yOpts);
00668       pmePencilZ = CProxy_CudaPmePencilZ::ckNew(zOpts);
00669       // Send pencil proxies to other nodes
00670       thisProxy.recvPencils(pmePencilX, pmePencilY, pmePencilZ);
00671       pmePencilX.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
00672       pmePencilY.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
00673       pmePencilZ.initialize(new CudaPmeXInitMsg(pmeGrid, pmePencilX, pmePencilY, pmePencilZ, xMap, yMap, zMap));
00674     }
00675   }
00676 
00677 }
00678 
00679 void ComputePmeCUDAMgr::createDevicesAndAtomFiler() {
00680   if (CkMyNode() != 0)
00681     NAMD_bug("ComputePmeCUDAMgr::createDevicesAndAtomFiler can only be called on root node");
00682 
00683   // Root node creates all device proxies
00684   // NOTE: Only root node has numDevicesMax
00685   RecvDeviceMsg* msg = new (numDevicesMax, PRIORITY_SIZE) RecvDeviceMsg();
00686   msg->numDevicesMax = numDevicesMax;
00687   for (int i=0;i < numDevicesMax;i++) {
00688     CProxy_ComputePmeCUDADevice dev = CProxy_ComputePmeCUDADevice::ckNew();
00689     memcpy(&msg->dev[i], &dev, sizeof(CProxy_ComputePmeCUDADevice));
00690   }
00691   thisProxy.recvDevices(msg);
00692 
00693   CProxy_PmeAtomFiler filer = CProxy_PmeAtomFiler::ckNew();
00694   thisProxy.recvAtomFiler(filer);
00695 
00696 }
00697 
00698 void ComputePmeCUDAMgr::recvAtomFiler(CProxy_PmeAtomFiler filer) {
00699   pmeAtomFiler = filer;
00700 }
00701 
00702 void ComputePmeCUDAMgr::recvDevices(RecvDeviceMsg* msg) {
00703   numDevicesMax = msg->numDevicesMax;
00704   if (numDevices > numDevicesMax)
00705     NAMD_bug("ComputePmeCUDAMgr::recvDevices, numDevices > numDevicesMax");
00706   deviceProxy.resize(numDevices);
00707   for (int i=0;i < numDevices;i++) {
00708     deviceProxy[i] = msg->dev[i];
00709   }
00710   delete msg;
00711 }
00712 
00713 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXYZ xyz) {
00714   pmePencilXYZ = xyz;
00715 }
00716 
00717 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilXY xy, CProxy_CudaPmePencilZ z) {
00718   pmePencilXY = xy;
00719   pmePencilZ = z;
00720 }
00721 
00722 void ComputePmeCUDAMgr::recvPencils(CProxy_CudaPmePencilX x, CProxy_CudaPmePencilY y, CProxy_CudaPmePencilZ z) {
00723   pmePencilX = x;
00724   pmePencilY = y;
00725   pmePencilZ = z;
00726 }
00727 
00728 //
00729 // Initialize pencils on this node
00730 // This gets called on one rank on each node
00731 //
00732 void ComputePmeCUDAMgr::initialize_pencils(CkQdMsg *msg) {
00733   if (msg != NULL) delete msg;
00734 
00735   int numDevicesTmp = deviceCUDA->getNumDevice();
00736 
00737   // Initialize device proxies for real-space interfacing
00738   for (int i=0;i < ijPencilX.size();i++) {
00739     // NOTE: i is here the device ID
00740     int deviceID = deviceCUDA->getDeviceIDbyRank(i % numDevicesTmp);
00741     deviceProxy[i].ckLocalBranch()->initialize(pmeGrid, ijPencilX[i].i, ijPencilX[i].j,
00742       deviceID, pmePencilType, thisProxy, pmeAtomFiler);
00743     if (pmePencilType == 1) {
00744       deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilX);
00745     } else if (pmePencilType == 2) {
00746       deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXY);
00747     } else {
00748       deviceProxy[i].ckLocalBranch()->setPencilProxy(pmePencilXYZ);
00749     }
00750   }
00751 
00752   // Use above initialized device proxies for the PME pencils that interface with real-space
00753   for (int i=0;i < ijPencilX.size();i++) {
00754     if (pmePencilType == 1) {
00755       pmePencilX(0, ijPencilX[i].i, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
00756     } else if (pmePencilType == 2) {
00757       pmePencilXY(0, 0, ijPencilX[i].j).initializeDevice(new InitDeviceMsg(deviceProxy[i]));
00758     } else {
00759       pmePencilXYZ[0].initializeDevice(new InitDeviceMsg(deviceProxy[i]));
00760     }
00761   }
00762 
00763   // Create extra devices for Y and Z pencils if necessary
00764   int n = std::max(ijPencilY.size(), ijPencilZ.size());
00765   if (n > ijPencilX.size()) {
00766     int nextra = n - ijPencilX.size();
00767     extraDevices.resize(nextra);
00768     for (int i=0;i < nextra;i++) {
00769       extraDevices[i].deviceID = deviceCUDA->getDeviceIDbyRank((i + ijPencilX.size()) % numDevicesTmp);
00770       cudaCheck(cudaSetDevice(extraDevices[i].deviceID));
00771       createStream(extraDevices[i].stream);
00772     }
00773   }
00774 
00775   // Initialize Y pencils
00776   for (int i=0;i < ijPencilY.size();i++) {
00777     int deviceID;
00778     cudaStream_t stream;
00779     if (i < ijPencilX.size()) {
00780       deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
00781       stream   = deviceProxy[i].ckLocalBranch()->getStream();
00782     } else {
00783       deviceID = extraDevices[i-ijPencilX.size()].deviceID;
00784       stream   = extraDevices[i-ijPencilX.size()].stream;
00785     }
00786     pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
00787   }
00788 
00789   // Initialize Z pencils
00790   for (int i=0;i < ijPencilZ.size();i++) {
00791     int deviceID;
00792     cudaStream_t stream;
00793     if (i < ijPencilX.size()) {
00794       deviceID = deviceProxy[i].ckLocalBranch()->getDeviceID();
00795       stream   = deviceProxy[i].ckLocalBranch()->getStream();
00796     } else {
00797       deviceID = extraDevices[i-ijPencilX.size()].deviceID;
00798       stream   = extraDevices[i-ijPencilX.size()].stream;
00799     }
00800     pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).initializeDevice(new InitDeviceMsg2(deviceID, stream, thisProxy));
00801   }
00802 
00803 }
00804 
00805 //
00806 // Activate (start) pencils
00807 // This gets called on rank 0 Pe on each node
00808 //
00809 void ComputePmeCUDAMgr::activate_pencils(CkQdMsg *msg) {
00810   if (msg != NULL) delete msg;
00811 
00812   for (int device=0;device < numDevices;device++) {
00813     deviceProxy[device].ckLocalBranch()->activate_pencils();
00814   }
00815 
00816   for (int i=0;i < ijPencilY.size();i++) {
00817     PmeStartMsg* pmeStartYMsg = new PmeStartMsg();
00818     pmeStartYMsg->data = NULL;
00819     pmeStartYMsg->dataSize = 0;
00820     pmePencilY(ijPencilY[i].i, 0, ijPencilY[i].j).start(pmeStartYMsg);
00821   }
00822 
00823   for (int i=0;i < ijPencilZ.size();i++) {
00824     PmeStartMsg* pmeStartZMsg = new PmeStartMsg();
00825     pmeStartZMsg->data = NULL;
00826     pmeStartZMsg->dataSize = 0;
00827     pmePencilZ(ijPencilZ[i].i, ijPencilZ[i].j, 0).start(pmeStartZMsg);
00828   }
00829 
00830 }
00831 
00832 //
00833 // Returns node that contains x-pencil i,j
00834 //
00835 int ComputePmeCUDAMgr::getNode(int i, int j) {
00836   if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
00837     NAMD_bug("ComputePmeCUDAMgr::getNode, pencil index out of bounds");
00838   int ind = i + j*pmeGrid.yBlocks;
00839   return nodeDeviceList[ind].node;
00840 }
00841 
00842 //
00843 // Returns home node for a patch
00844 //
00845 int ComputePmeCUDAMgr::getHomeNode(PatchID patchID) {
00846   int homey, homez;
00847   getHomePencil(patchID, homey, homez);
00848   return getNode(homey, homez);
00849 }
00850 
00851 //
00852 // Returns device index on this node that contains x-pencil i,j
00853 //
00854 int ComputePmeCUDAMgr::getDevice(int i, int j) {
00855   if (i < 0 || i >= pmeGrid.yBlocks || j < 0 || j >= pmeGrid.zBlocks)
00856     NAMD_bug("ComputePmeCUDAMgr::getDevice, pencil index out of bounds");
00857   int ind = i + j*pmeGrid.yBlocks;
00858   int device = nodeDeviceList[ind].device;
00859   if (device == -1)
00860     NAMD_bug("ComputePmeCUDAMgr::getDevice, no device found");
00861   return device;
00862 }
00863 
00864 //
00865 // Returns device index on this node that contains y-pencil i,j
00866 //
00867 int ComputePmeCUDAMgr::getDevicePencilY(int i, int j) {
00868   if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.zBlocks)
00869     NAMD_bug("ComputePmeCUDAMgr::getDevicePencilY, pencil index out of bounds");
00870   for (int device=0;device < ijPencilY.size();device++) {
00871     if (ijPencilY[device].i == i && ijPencilY[device].j == j) return device;
00872   }
00873   char str[256];
00874   sprintf(str, "ComputePmeCUDAMgr::getDevicePencilY, no device found at i %d j %d",i,j);
00875   NAMD_bug(str);
00876   return -1;
00877 }
00878 
00879 //
00880 // Returns device index on this node that contains z-pencil i,j
00881 //
00882 int ComputePmeCUDAMgr::getDevicePencilZ(int i, int j) {
00883   if (i < 0 || i >= pmeGrid.xBlocks || j < 0 || j >= pmeGrid.yBlocks)
00884     NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, pencil index out of bounds");
00885   for (int device=0;device < ijPencilZ.size();device++) {
00886     if (ijPencilZ[device].i == i && ijPencilZ[device].j == j) return device;
00887   }
00888   NAMD_bug("ComputePmeCUDAMgr::getDevicePencilZ, no device found");
00889   return -1;
00890 }
00891 
00892 //
00893 // Returns device ID on this node that contains x-pencil i,j
00894 //
00895 int ComputePmeCUDAMgr::getDeviceIDPencilX(int i, int j) {
00896   int device = getDevice(i, j);
00897   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00898 }
00899 
00900 //
00901 // Returns device ID on this node that contains y-pencil i,j
00902 //
00903 int ComputePmeCUDAMgr::getDeviceIDPencilY(int i, int j) {
00904   int device = getDevicePencilY(i, j);
00905   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00906 }
00907 
00908 //
00909 // Returns device ID on this node that contains z-pencil i,j
00910 //
00911 int ComputePmeCUDAMgr::getDeviceIDPencilZ(int i, int j) {
00912   int device = getDevicePencilZ(i, j);
00913   return deviceProxy[device].ckLocalBranch()->getDeviceID();
00914 }
00915 
00916 //
00917 // Skip this round of PME, call skip on all Z-pencils (this is needed to get the reductions submitted)
00918 //
00919 void ComputePmeCUDAMgr::skip() {
00920   switch(pmePencilType) {
00921     case 1:
00922     pmePencilZ.skip();
00923     break;
00924     case 2:
00925     pmePencilZ.skip();
00926     break;
00927     case 3:
00928     pmePencilXYZ[0].skip();
00929     break;
00930   }
00931 }
00932 
00933 void ComputePmeCUDAMgr::recvAtoms(PmeAtomMsg *msg) {
00934   int device = getDevice(msg->i, msg->j);
00935   deviceProxy[device].ckLocalBranch()->recvAtoms(msg);
00936 }
00937 
00938 ComputePmeCUDADevice::ComputePmeCUDADevice() {
00939   // __sdag_init();
00940   numHomePatches = 0;
00941   forceCapacity = 0;
00942   force = NULL;
00943   pmeRealSpaceCompute = NULL;
00944   streamCreated = false;
00945   lock_numHomePatchesMerged = CmiCreateLock();
00946   lock_numPencils = CmiCreateLock();
00947   lock_numNeighborsRecv = CmiCreateLock();
00948   lock_recvAtoms = CmiCreateLock();
00949   numNeighborsExpected = 0;
00950   numStrayAtoms = 0;
00951   // Reset counters
00952   numNeighborsRecv = 0;
00953   numHomePatchesRecv = 0;
00954   numHomePatchesMerged = 0;
00955   atomI = 0;
00956   forceI = 1;
00957 }
00958 
00959 ComputePmeCUDADevice::ComputePmeCUDADevice(CkMigrateMessage *m) {
00960   // __sdag_init();
00961   numHomePatches = 0;
00962   forceCapacity = 0;
00963   force = NULL;
00964   pmeRealSpaceCompute = NULL;
00965   streamCreated = false;
00966   lock_numHomePatchesMerged = CmiCreateLock();
00967   lock_numPencils = CmiCreateLock();
00968   lock_numNeighborsRecv = CmiCreateLock();
00969   lock_recvAtoms = CmiCreateLock();
00970   numNeighborsExpected = 0;
00971   numStrayAtoms = 0;
00972   // Reset counters
00973   numNeighborsRecv = 0;
00974   numHomePatchesRecv = 0;
00975   numHomePatchesMerged = 0;
00976   atomI = 0;
00977   forceI = 1;
00978 }
00979 
00980 ComputePmeCUDADevice::~ComputePmeCUDADevice() {
00981   if (streamCreated) {
00982     cudaCheck(cudaSetDevice(deviceID));
00983     cudaCheck(cudaStreamDestroy(stream));
00984   }
00985   for (int j=0;j < 2;j++)
00986     for (int i=0;i < pmeAtomStorage[j].size();i++) {
00987       if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
00988     }
00989   if (force != NULL) deallocate_host<CudaForce>(&force);
00990   if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
00991   CmiDestroyLock(lock_numHomePatchesMerged);
00992   CmiDestroyLock(lock_numPencils);
00993   CmiDestroyLock(lock_numNeighborsRecv);
00994   CmiDestroyLock(lock_recvAtoms);
00995 }
00996 
00997 void ComputePmeCUDADevice::initialize(PmeGrid& pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in,
00998   int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in,
00999   CProxy_PmeAtomFiler pmeAtomFiler_in) {
01000 
01001   deviceID = deviceID_in;
01002   cudaCheck(cudaSetDevice(deviceID));
01003   pmePencilType = pmePencilType_in;
01004   pmeGrid = pmeGrid_in;
01005   pencilIndexY = pencilIndexY_in;
01006   pencilIndexZ = pencilIndexZ_in;
01007   mgrProxy = mgrProxy_in;
01008   pmeAtomFiler = pmeAtomFiler_in;
01009   // Size of the neighboring pencil grid, max 3x3
01010   yNBlocks = std::min(pmeGrid.yBlocks, 3);
01011   zNBlocks = std::min(pmeGrid.zBlocks, 3);
01012   // Local pencil is at y=0,z=0
01013   if (yNBlocks == 1) {
01014     ylo = 0;
01015     yhi = 0;
01016   } else if (yNBlocks == 2) {
01017     ylo = -1;
01018     yhi = 0;
01019   } else {
01020     ylo = -1;
01021     yhi = 1;
01022   }
01023   if (zNBlocks == 1) {
01024     zlo = 0;
01025     zhi = 0;
01026   } else if (zNBlocks == 2) {
01027     zlo = -1;
01028     zhi = 0;
01029   } else {
01030     zlo = -1;
01031     zhi = 1;
01032   }
01033   
01034   neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
01035   // neighborForcePencils.resize(yNBlocks*zNBlocks);
01036   for (int j=0;j < 2;j++)
01037     homePatchIndexList[j].resize(yNBlocks*zNBlocks);
01038   neighborPatchIndex.resize(yNBlocks*zNBlocks);
01039 
01040   pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
01041   for (int j=0;j < 2;j++) {
01042     pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
01043     for (int z=zlo;z <= zhi;z++) {
01044       for (int y=ylo;y <= yhi;y++) {
01045         int pp = y-ylo + (z-zlo)*yNBlocks;
01046         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01047         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01048         if (y == 0 && z == 0) {
01049           // Primary pencil
01050           pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
01051         } else {
01052           pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
01053         }
01054         pmeAtomStorageAllocatedHere[pp] = true;
01055       }
01056     }
01057   }
01058 
01059   // Create stream for this device
01060   createStream(stream);
01061   streamCreated = true;
01062   pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
01063     deviceID, stream);
01064 
01065 }
01066 
01067 cudaStream_t ComputePmeCUDADevice::getStream() {
01068   return stream;
01069 }
01070 
01071 int ComputePmeCUDADevice::getDeviceID() {
01072   return deviceID;
01073 }
01074 
01075 CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy() {
01076   return mgrProxy;
01077 }
01078 
01079 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXYZ pmePencilXYZ_in) {
01080   if (pmePencilType != 3)
01081     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
01082   pmePencilXYZ = pmePencilXYZ_in;
01083 }
01084 
01085 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilXY pmePencilXY_in) {
01086   if (pmePencilType != 2)
01087     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
01088   pmePencilXY = pmePencilXY_in;
01089 }
01090 
01091 void ComputePmeCUDADevice::setPencilProxy(CProxy_CudaPmePencilX pmePencilX_in) {
01092   if (pmePencilType != 1)
01093     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
01094   pmePencilX = pmePencilX_in;
01095 }
01096 
01097 void ComputePmeCUDADevice::activate_pencils() {
01098   if (pmePencilType == 1) {
01099     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01100     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01101     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01102     pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
01103   } else if (pmePencilType == 2) {
01104     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01105     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01106     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01107     pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
01108   } else if (pmePencilType == 3) {
01109     PmeStartMsg* pmeStartMsg = new PmeStartMsg();
01110     pmeStartMsg->data = pmeRealSpaceCompute->getData();
01111     pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01112     pmePencilXYZ[0].start(pmeStartMsg);
01113   }
01114 }
01115 
01116 void ComputePmeCUDADevice::initializePatches(int numHomePatches_in) {
01117   numHomePatches = numHomePatches_in;
01118   for (int j=0;j < 2;j++)
01119     numPencils[j].resize(numHomePatches);
01120   for (int j=0;j < 2;j++)
01121     plList[j].resize(numHomePatches);
01122   for (int j=0;j < 2;j++)
01123     homePatchForceMsgs[j].resize(numHomePatches);
01124   // for (int j=0;j < 2;j++)
01125   //   numHomeAtoms[j].resize(numHomePatches);
01126   // If we have home patches, register this pencil with the neighbors and with self
01127   if (numHomePatches > 0) {
01128     for (int z=zlo;z <= zhi;z++) {
01129       for (int y=ylo;y <= yhi;y++) {
01130         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01131         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01132         int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01133         mgrProxy[node].registerNeighbor(yt, zt);
01134       }
01135     }
01136   }
01137 }
01138 
01139 void ComputePmeCUDADevice::registerNeighbor() {
01140   CmiLock(lock_numHomePatchesMerged);
01141   numNeighborsExpected++;
01142   CmiUnlock(lock_numHomePatchesMerged);
01143 }
01144 
01145 //
01146 // Recevice atoms from patch and file them into pencils
01147 //
01148 void ComputePmeCUDADevice::recvAtoms(PmeAtomMsg *msg) {
01149 
01150   PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
01151   // Store "virial" and "energy" flags
01152   doVirial = msg->doVirial;
01153   doEnergy = msg->doEnergy;
01154   // Get lattice
01155   SimParameters *simParams = Node::Object()->simParameters;
01156   Lattice lattice = simParams->lattice;
01157 
01158   // Primary pencil index
01159   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01160   int p0 = 0;
01161   int pencilPatchIndex[9];
01162   int numStrayAtomsPatch = 0;
01163   if (pmePencilType == 3) {
01164     // 3D box => store atoms directly without index
01165     // NOTE: We don't check for stray atoms here!
01166     pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01167   } else {
01168 
01169     // File atoms
01170     pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
01171       pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
01172 
01173     // Loop through pencils and add atoms to pencil atom lists
01174     // NOTE: we only store to neighboring pencil if there are atoms to store
01175     int numAtomsCheck = 0;
01176     for (int p=0;p < 9;p++) {
01177 
01178       int y = (p % 3);
01179       int z = (p / 3);
01180 
01181       int pp = y + z*yNBlocks;
01182       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01183       if (pp == pp0) p0 = p;
01184       if (pp == pp0 || numAtoms > 0) {
01185         if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
01186           NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
01187         int* index = pmeAtomFilerPtr->getAtomIndex(p);
01188         pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
01189         // Number of patches in this storage tells you how many home patches contributed and
01190         // homePatchIndex (pe) tells you which patch contributed
01191         numAtomsCheck += numAtoms;
01192       }
01193     }
01194 
01195     // Deal with stray atoms
01196     numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
01197     if (numStrayAtomsPatch > 0) {
01198       int* index = pmeAtomFilerPtr->getAtomIndex(9);
01199       CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
01200       for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
01201         int j = index[i];
01202         CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
01203       }
01204     }
01205 
01206     if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
01207       NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
01208   }
01209 
01210   // Create storage for home patch forces
01211   PmeForceMsg *forceMsg;
01212   if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
01213     // 3D FFT and compute resides on the same node => use zero-copy forces
01214     forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
01215     forceMsg->zeroCopy = true;
01216   } else {
01217     forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
01218     forceMsg->zeroCopy = false;
01219   }
01220   forceMsg->numAtoms = msg->numAtoms;
01221   forceMsg->pe = msg->pe;
01222   forceMsg->compute = msg->compute;
01223   forceMsg->numStrayAtoms = numStrayAtomsPatch;
01224 
01225   bool done = false;
01226   // ----------------------------- lock start ---------------------------
01227   // Only after writing has finished, we get homePatchIndex
01228   // This quarantees that for whatever thread that receives "done=true", writing has finished on
01229   // ALL threads.
01230   CmiLock(lock_recvAtoms);
01231   numStrayAtoms += numStrayAtomsPatch;
01232   // Secure homePatchIndex. All writes after this must be inside lock-region
01233   int homePatchIndex = numHomePatchesRecv;
01234   // Store primary pencil first
01235   plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
01236   if (pmePencilType != 3) {
01237     // Go back to through neighboring pencils and store "homePatchIndex"
01238     for (int p=0;p < 9;p++) {
01239 
01240       int y = (p % 3);
01241       int z = (p / 3);
01242 
01243       int pp = y + z*yNBlocks;
01244       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01245       if (pp != pp0 && numAtoms > 0) {
01246         homePatchIndexList[atomI][pp].push_back(homePatchIndex);
01247         // plList[0...numHomePatches-1] = for each home patch stores the location of pencils that are
01248         //                                sharing it
01249         // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
01250         plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
01251       }
01252     }
01253   }
01254   homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
01255   // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
01256   // Set the number of pencils contributing to this home patch
01257   numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
01258   //
01259   numHomePatchesRecv++;
01260   if (numHomePatchesRecv == numHomePatches) {
01261     // Reset counter
01262     numHomePatchesRecv = 0;
01263     done = true;
01264   }
01265   CmiUnlock(lock_recvAtoms);
01266   // ----------------------------- lock end  ---------------------------
01267 
01268   // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
01269 
01270   delete msg;
01271 
01272   if (done) {
01273     // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
01274     sendAtomsToNeighbors();
01275   }
01276 }
01277 
01278 //
01279 // Loop through pencils and send atoms to neighboring nodes
01280 //
01281 void ComputePmeCUDADevice::sendAtomsToNeighbors() {
01282   for (int z=zlo;z <= zhi;z++) {
01283     for (int y=ylo;y <= yhi;y++) {
01284       // Only send to neighbors, not self
01285       if (y != 0 || z != 0) {
01286         // NOTE: Must send atomI -value since this will change in spreadCharge(), which might occur
01287         // before these sends have been performed
01288         thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
01289       }
01290     }
01291   }
01292   // Register primary pencil
01293   registerRecvAtomsFromNeighbor();
01294 }
01295 
01296 void ComputePmeCUDADevice::sendAtomsToNeighbor(int y, int z, int atomIval) {
01297   // Pencil index  
01298   int pp = y-ylo + (z-zlo)*yNBlocks;
01299   // This neighbor pencil is done, finish it up before accessing it
01300   pmeAtomStorage[atomIval][pp]->finish();
01301   // Compute destination neighbor pencil index (yt,zt)
01302   int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01303   int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01304   int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
01305   CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
01306   PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
01307   memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
01308   msgPencil->numAtoms = numAtoms;
01309   // Store destination pencil index
01310   msgPencil->y = yt;
01311   msgPencil->z = zt;
01312   // Store source pencil index
01313   msgPencil->srcY = pencilIndexY;
01314   msgPencil->srcZ = pencilIndexZ;
01315   // Store energy and virial flags
01316   msgPencil->doEnergy = doEnergy;
01317   msgPencil->doVirial = doVirial;
01318   int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01319   mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
01320 }
01321 
01322 void ComputePmeCUDADevice::recvAtomsFromNeighbor(PmeAtomPencilMsg *msg) {
01323   // Store into primary pencil
01324   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01325   // Compute pencil index relative to primary pencil
01326   int y = msg->srcY - pencilIndexY;
01327   if (y < ylo) y += pmeGrid.yBlocks;
01328   if (y > yhi) y -= pmeGrid.yBlocks;
01329   int z = msg->srcZ - pencilIndexZ;
01330   if (z < zlo) z += pmeGrid.zBlocks;
01331   if (z > zhi) z -= pmeGrid.zBlocks;
01332   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01333     NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
01334   }
01335   // Read energy and virial flags
01336   doEnergy = msg->doEnergy;
01337   doVirial = msg->doVirial;
01338   // Pencil index where atoms came from
01339   int pp = y-ylo + (z-zlo)*yNBlocks;
01340   // Store atoms and mark down the patch index where these atoms were added
01341   neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01342 
01343   delete msg;
01344 
01345   registerRecvAtomsFromNeighbor();
01346 }
01347 
01348 void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor() {
01349   // Primary pencil
01350   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01351 
01352   bool done = false;
01353   // ----------------------------- lock start ---------------------------
01354   CmiLock(lock_numNeighborsRecv);
01355   numNeighborsRecv++;
01356   if (numNeighborsRecv == numNeighborsExpected) {
01357     // Reset counter
01358     numNeighborsRecv = 0;
01359     done = true;
01360   }
01361   CmiUnlock(lock_numNeighborsRecv);
01362   // ----------------------------- lock end  ---------------------------
01363 
01364   if (done) {
01365     // Primary pencil has received all atoms and writing has finished => spread charge
01366     spreadCharge();
01367   }  
01368 }
01369 
01370 void ComputePmeCUDADevice::spreadCharge() {
01371   // Spread charges in primary pencil
01372   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01373   // Primary pencil is done, finish it up before accessing it
01374   // (clearing is done in mergeForcesOnPatch)
01375   pmeAtomStorage[atomI][pp0]->finish();
01376   // Get the number of atoms and pointer to atoms
01377   int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
01378   CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
01379   // Flip atomI <-> forceI
01380   std::swap(atomI, forceI);
01381   // Re-allocate force buffer if needed
01382   reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
01383   // Setup patches and atoms
01384   SimParameters *simParams = Node::Object()->simParameters;
01385   Lattice lattice = simParams->lattice;
01386   pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
01387   // Spread charge
01388   beforeWalltime = CmiWallTimer();
01389   pmeRealSpaceCompute->spreadCharge(lattice);
01390   // Send "charge grid ready to PME solver"
01391   PmeRunMsg *pmeRunMsg = new PmeRunMsg();
01392   pmeRunMsg->doVirial = doVirial;
01393   pmeRunMsg->doEnergy = doEnergy;
01394   pmeRunMsg->lattice = lattice;
01395   pmeRunMsg->numStrayAtoms = numStrayAtoms;
01396   // Reset stray atom counter
01397   numStrayAtoms = 0;
01398   switch(pmePencilType) {
01399     case 1:
01400     pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
01401     break;
01402     case 2:
01403     pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
01404     break;
01405     case 3:
01406     pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
01407     break;
01408   }
01409 }
01410 
01411 //
01412 // After PME solver is done, we return here
01413 //
01414 void ComputePmeCUDADevice::gatherForce() {
01415   traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
01416   beforeWalltime = CmiWallTimer();
01417   // gather (i.e. un-grid) forces
01418   SimParameters *simParams = Node::Object()->simParameters;
01419   Lattice lattice = simParams->lattice;
01420   pmeRealSpaceCompute->gatherForce(lattice, force);
01421   // Set callback that will call gatherForceDone() once gatherForce is done
01422   ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
01423   // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
01424   // gatherForceDone();
01425 }
01426 
01427 static inline void gatherForceDoneLoop(int first, int last, void *result, int paraNum, void *param) {
01428   ComputePmeCUDADevice* c = (ComputePmeCUDADevice *)param;
01429   c->gatherForceDoneSubset(first, last);
01430 }
01431 
01432 void ComputePmeCUDADevice::gatherForceDoneSubset(int first, int last) {
01433   for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
01434     bool done = false;
01435     // ----------------------------- lock start ---------------------------
01436     // NOTE: We use node-wide lock here for the entire numPencils[] array, while
01437     //       we really would only need to each element but this would required
01438     //       numHomePatches number of locks.
01439     if (pmePencilType != 3) CmiLock(lock_numPencils);
01440     numPencils[forceI][homePatchIndex]--;
01441     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01442     if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01443     // ----------------------------- lock end  ---------------------------
01444     if (done) {
01445       // This home patch is done, launch force merging
01446       mergeForcesOnPatch(homePatchIndex);
01447     }
01448   }
01449 }
01450 
01451 void ComputePmeCUDADevice::gatherForceDone() {
01452   // Primary pencil has the forces
01453 
01454   traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
01455 
01456   // Send forces to neighbors
01457   sendForcesToNeighbors();
01458 
01459 #if CMK_SMP && USE_CKLOOP
01460   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01461   if (useCkLoop >= 1) {
01462     CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
01463   } else
01464 #endif
01465 
01466   {
01467     // Loop through home patches and mark the primary pencil as "done"
01468     for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
01469       bool done = false;
01470       // ----------------------------- lock start ---------------------------
01471       // NOTE: We use node-wide lock here for the entire numPencils[] array, while
01472       //       we really would only need to each element but this would required
01473       //       numHomePatches number of locks.
01474       if (pmePencilType != 3) CmiLock(lock_numPencils);
01475       numPencils[forceI][homePatchIndex]--;
01476       if (numPencils[forceI][homePatchIndex] == 0) done = true;
01477       if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01478       // ----------------------------- lock end  ---------------------------
01479       if (done) {
01480         // This home patch is done, launch force merging
01481         thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01482       }
01483     }
01484   }
01485 
01486   // In case we have no home patches, clear the primary pencil storage here
01487   if (numHomePatches == 0) {
01488     int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01489     pmeAtomStorage[forceI][pp0]->clear();
01490   }
01491 
01492 }
01493 
01494 //
01495 // After gatherForce is done, we end up here
01496 //
01497 void ComputePmeCUDADevice::sendForcesToNeighbors() {
01498   // Primary pencil has the forces
01499   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01500   int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01501   // Loop through neighboring pencils
01502   for (int z=zlo;z <= zhi;z++) {
01503     for (int y=ylo;y <= yhi;y++) {
01504       // Only send to neighbors, not self
01505       if (y != 0 || z != 0) {
01506         int pp = y-ylo + (z-zlo)*yNBlocks;
01507         int patchIndex = neighborPatchIndex[pp];
01508         int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
01509         int atomEnd   = patchPos[patchIndex];
01510         int natom = atomEnd-atomStart;
01511         // copy forces
01512         PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
01513         msg->numAtoms = natom;
01514         memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
01515         // Calculate destination pencil index (dstY, dstZ) for this neighbor
01516         int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01517         int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01518         int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
01519         msg->y = dstY;
01520         msg->z = dstZ;
01521         // Store source pencil index
01522         msg->srcY = pencilIndexY;
01523         msg->srcZ = pencilIndexZ;
01524         mgrProxy[node].recvForcesFromNeighbor(msg);
01525       }
01526     }
01527   }
01528 }
01529 
01530 void ComputePmeCUDADevice::recvForcesFromNeighbor(PmeForcePencilMsg *msg) {
01531 
01532   // Source pencil index
01533   int y = msg->srcY - pencilIndexY;
01534   if (y < ylo) y += pmeGrid.yBlocks;
01535   if (y > yhi) y -= pmeGrid.yBlocks;
01536   int z = msg->srcZ - pencilIndexZ;
01537   if (z < zlo) z += pmeGrid.zBlocks;
01538   if (z > zhi) z -= pmeGrid.zBlocks;
01539 
01540   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01541     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
01542   }
01543 
01544   // Source pencil
01545   int pp = y-ylo + (z-zlo)*yNBlocks;
01546 
01547   // Store message (deleted in mergeForcesOnPatch)
01548   neighborForcePencilMsgs[pp] = msg;
01549 
01550   // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
01551   // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
01552   // neighborForcePencils[pp].numAtoms = msg->numAtoms;
01553   // neighborForcePencils[pp].y = msg->y;
01554   // neighborForcePencils[pp].z = msg->z;
01555   // neighborForcePencils[pp].srcY = msg->srcY;
01556   // neighborForcePencils[pp].srcZ = msg->srcZ;
01557   // delete msg;
01558 
01559   // numPatches = number of home patches this pencil has
01560   int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
01561   if (numPatches != homePatchIndexList[forceI][pp].size()) {
01562     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
01563   }
01564   for (int i=0;i < numPatches;i++) {
01565     // this pencil contributed to home patch with index "homePatchIndex"
01566     int homePatchIndex = homePatchIndexList[forceI][pp][i];
01567     // ----------------------------- lock start ---------------------------
01568     // NOTE: We use node-wide lock here for the entire numPencils[] array, while
01569     //       we really would only need to each element but this would required
01570     //       numHomePatches number of locks.
01571     bool done = false;
01572     CmiLock(lock_numPencils);
01573     numPencils[forceI][homePatchIndex]--;
01574     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01575     CmiUnlock(lock_numPencils);
01576     // ----------------------------- lock end  ---------------------------
01577     if (done) {
01578       // This home patch is done, launch force merging
01579       thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01580     }
01581   }
01582 
01583 }
01584 
01585 void ComputePmeCUDADevice::mergeForcesOnPatch(int homePatchIndex) {
01586   // We have all the forces for this patch => merge on a single Pe
01587 
01588   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01589 
01590   // Message that goes out to the compute
01591   PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
01592 
01593   if (pmePencilType == 3) {
01594     // 3D box => simple memory copy will do
01595     // Location of forces in the force[] array
01596     int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01597     // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
01598     int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01599     int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01600     int atomEnd   = patchPos[pencilPatchIndex];
01601     int numAtoms = atomEnd-atomStart;
01602     if (forceMsg->zeroCopy) {
01603       // Zero-copy, just pass the pointer
01604       forceMsg->force = force+atomStart;
01605     } else {
01606       memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
01607     }
01608   } else {
01609 
01610     // Zero force array
01611     // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
01612     memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
01613 
01614     // Store forces from primary pencil
01615     {
01616       int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01617       int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
01618       int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01619       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01620       int atomEnd   = patchPos[pencilPatchIndex];
01621       int numAtoms = atomEnd-atomStart;
01622 
01623       // Copy in local forces that are stored in the force[] array
01624       for (int i=0;i < numAtoms;i++) {
01625         forceMsg->force[index[atomStart + i]] = force[atomStart + i];
01626       }
01627 
01628     }
01629 
01630     // Add forces from neighboring pencils
01631     for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
01632       int pp               = plList[forceI][homePatchIndex][j].pp;
01633       int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
01634 
01635       int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
01636       int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
01637       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01638       int atomEnd   = patchPos[pencilPatchIndex];
01639       int numAtoms = atomEnd-atomStart;
01640       CudaForce *dstForce = forceMsg->force;
01641       // CudaForce *srcForce = neighborForcePencils[pp].force;
01642       CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
01643 
01644       for (int i=0;i < numAtoms;i++) {
01645         dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
01646         dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
01647         dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
01648       }
01649 
01650     }
01651   }
01652 
01653   // Clear storage
01654   plList[forceI][homePatchIndex].clear();
01655 
01656   // ----------------------------- lock start ---------------------------
01657   // bool done = false;
01658   CmiLock(lock_numHomePatchesMerged);
01659   numHomePatchesMerged++;
01660   if (numHomePatchesMerged == numHomePatches) {
01661     // Reset counter
01662     numHomePatchesMerged = 0;
01663 
01664     // Delete messages
01665     for (int i=0;i < neighborForcePencilMsgs.size();i++) {
01666       if (neighborForcePencilMsgs[i] != NULL) {
01667         delete neighborForcePencilMsgs[i];
01668         neighborForcePencilMsgs[i] = NULL;
01669       }
01670     }
01671 
01672     // Done merging and sending forces => clear storage
01673     for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
01674       homePatchIndexList[forceI][pp].clear();
01675     for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
01676       pmeAtomStorage[forceI][pp]->clear();
01677 
01678   }
01679   CmiUnlock(lock_numHomePatchesMerged);
01680   // ----------------------------- lock end  ---------------------------
01681 
01682   // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
01683   // this node will then rely the message to the Pe that originally sent the atoms
01684   int pe = forceMsg->pe;
01685   if (CkNodeOf(pe) != CkMyNode())
01686     thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
01687   else
01688     sendForcesToPatch(forceMsg);
01689 
01690 }
01691 
01692 void ComputePmeCUDADevice::sendForcesToPatch(PmeForceMsg *forceMsg) {
01693   // Now we're on the node that has Pe, hence "compute" -pointer is valid
01694   int pe                  = forceMsg->pe;
01695   ComputePmeCUDA *compute = forceMsg->compute;
01696 
01697   // Store message for use in ComputePmeCUDA, where it'll also be deleted.
01698   if (compute->storePmeForceMsg(forceMsg)) {
01699     // Enqueue on the pe that sent the atoms in the first place
01700     LocalWorkMsg *lmsg = compute->localWorkMsg;
01701     CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
01702     wdProxy[pe].enqueuePme(lmsg);
01703   }
01704 }
01705 
01706 //
01707 // Received self energy from ComputePmeCUDA computes
01708 // NOTE: This should only happen once at the start of the simulation since self energy is constant
01709 //
01710 void ComputePmeCUDAMgr::recvSelfEnergy(PmeSelfEnergyMsg *msg) {
01711   // NOTE: msg is deleted in PmePencilXYZ / PmePencilX
01712   switch(pmePencilType) {
01713     case 1:
01714     pmePencilZ(0,0,0).recvSelfEnergy(msg);
01715     break;
01716     case 2:
01717     pmePencilZ(0,0,0).recvSelfEnergy(msg);
01718     break;
01719     case 3:
01720     pmePencilXYZ[0].recvSelfEnergy(msg);
01721     break;
01722   }  
01723 }
01724 #endif // NAMD_CUDA
01725 
01726 #include "ComputePmeCUDAMgr.def.h"

Generated on Sun Feb 25 01:17:14 2018 for NAMD by  doxygen 1.4.7