ComputePmeCUDADevice Class Reference

#include <ComputePmeCUDAMgr.h>


Public Member Functions

 ComputePmeCUDADevice ()
 ComputePmeCUDADevice (CkMigrateMessage *m)
 ~ComputePmeCUDADevice ()
void initialize (PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
int getDeviceID ()
cudaStream_t getStream ()
CProxy_ComputePmeCUDAMgr getMgrProxy ()
void setPencilProxy (CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void setPencilProxy (CProxy_CudaPmePencilXY pmePencilXY_in)
void setPencilProxy (CProxy_CudaPmePencilX pmePencilX_in)
void activate_pencils ()
void initializePatches (int numHomePatches_in)
void registerNeighbor ()
void recvAtoms (PmeAtomMsg *msg)
void sendAtomsToNeighbors ()
void sendAtomsToNeighbor (int y, int z, int atomIval)
void recvAtomsFromNeighbor (PmeAtomPencilMsg *msg)
void registerRecvAtomsFromNeighbor ()
void spreadCharge ()
void gatherForce ()
void gatherForceDone ()
void sendForcesToNeighbors ()
void recvForcesFromNeighbor (PmeForcePencilMsg *msg)
void mergeForcesOnPatch (int homePatchIndex)
void sendForcesToPatch (PmeForceMsg *forceMsg)
void gatherForceDoneSubset (int first, int last)

Classes

struct  PencilLocation


Detailed Description

Definition at line 291 of file ComputePmeCUDAMgr.h.
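
ComputePmeCUDADevice manages the PME real-space work assigned to one CUDA device. Per step it receives atoms from the home patches on its node (recvAtoms()), files them into the primary pencil and its up-to-3x3 neighborhood, spreads charges on the GPU (spreadCharge()), and, once the reciprocal-space solve in the pencil chares has finished, gathers forces back (gatherForce()), merges the per-pencil contributions (mergeForcesOnPatch()), and returns forces to the patches (sendForcesToPatch()). The atomI/forceI index pair double-buffers the pencil storage so that atom reception for the next step can overlap force handling for the current one.

A minimal sketch of the setup sequence, pieced together from the member documentation below (the per-step flow is message-driven and only indicated in comments; this is not literal NAMD driver code):

  // One-time setup on each device, assuming pmeGrid, proxies etc. exist:
  dev.initialize(pmeGrid, pencilIndexY, pencilIndexZ, deviceID,
                 pmePencilType, mgrProxy, pmeAtomFiler);
  dev.initializePatches(numHomePatches); // registers with neighbor pencils
  dev.activate_pencils();                // hands the grid buffer to the FFT pencils
  // Per step (driven by messages):
  //   recvAtoms() ... sendAtomsToNeighbors() ... recvAtomsFromNeighbor()
  //   -> spreadCharge() -> [FFT / solve] -> gatherForce() -> gatherForceDone()
  //   -> sendForcesToNeighbors()/recvForcesFromNeighbor()
  //   -> mergeForcesOnPatch() -> sendForcesToPatch()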


Constructor & Destructor Documentation

ComputePmeCUDADevice::ComputePmeCUDADevice (  ) 

Definition at line 938 of file ComputePmeCUDAMgr.C.

00938                                            {
00939   // __sdag_init();
00940   numHomePatches = 0;
00941   forceCapacity = 0;
00942   force = NULL;
00943   pmeRealSpaceCompute = NULL;
00944   streamCreated = false;
00945   lock_numHomePatchesMerged = CmiCreateLock();
00946   lock_numPencils = CmiCreateLock();
00947   lock_numNeighborsRecv = CmiCreateLock();
00948   lock_recvAtoms = CmiCreateLock();
00949   numNeighborsExpected = 0;
00950   numStrayAtoms = 0;
00951   // Reset counters
00952   numNeighborsRecv = 0;
00953   numHomePatchesRecv = 0;
00954   numHomePatchesMerged = 0;
00955   atomI = 0;
00956   forceI = 1;
00957 }

ComputePmeCUDADevice::ComputePmeCUDADevice ( CkMigrateMessage *  m  ) 

Definition at line 959 of file ComputePmeCUDAMgr.C.

00959                                                               {
00960   // __sdag_init();
00961   numHomePatches = 0;
00962   forceCapacity = 0;
00963   force = NULL;
00964   pmeRealSpaceCompute = NULL;
00965   streamCreated = false;
00966   lock_numHomePatchesMerged = CmiCreateLock();
00967   lock_numPencils = CmiCreateLock();
00968   lock_numNeighborsRecv = CmiCreateLock();
00969   lock_recvAtoms = CmiCreateLock();
00970   numNeighborsExpected = 0;
00971   numStrayAtoms = 0;
00972   // Reset counters
00973   numNeighborsRecv = 0;
00974   numHomePatchesRecv = 0;
00975   numHomePatchesMerged = 0;
00976   atomI = 0;
00977   forceI = 1;
00978 }

ComputePmeCUDADevice::~ComputePmeCUDADevice (  ) 

Definition at line 980 of file ComputePmeCUDAMgr.C.

References cudaCheck.

00980                                             {
00981   if (streamCreated) {
00982     cudaCheck(cudaSetDevice(deviceID));
00983     cudaCheck(cudaStreamDestroy(stream));
00984   }
00985   for (int j=0;j < 2;j++)
00986     for (int i=0;i < pmeAtomStorage[j].size();i++) {
00987       if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
00988     }
00989   if (force != NULL) deallocate_host<CudaForce>(&force);
00990   if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
00991   CmiDestroyLock(lock_numHomePatchesMerged);
00992   CmiDestroyLock(lock_numPencils);
00993   CmiDestroyLock(lock_numNeighborsRecv);
00994   CmiDestroyLock(lock_recvAtoms);
00995 }


Member Function Documentation

void ComputePmeCUDADevice::activate_pencils (  ) 

Definition at line 1102 of file ComputePmeCUDAMgr.C.

References PmeStartMsg::data, PmeStartMsg::dataSize, PmeRealSpaceCompute::getData(), and PmeRealSpaceCompute::getDataSize().

01102                                             {
01103   if (pmePencilType == 1) {
01104     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01105     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01106     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01107     pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
01108   } else if (pmePencilType == 2) {
01109     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01110     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01111     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01112     pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
01113   } else if (pmePencilType == 3) {
01114     PmeStartMsg* pmeStartMsg = new PmeStartMsg();
01115     pmeStartMsg->data = pmeRealSpaceCompute->getData();
01116     pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01117     pmePencilXYZ[0].start(pmeStartMsg);
01118   }
01119 }

void ComputePmeCUDADevice::gatherForce (  ) 

Definition at line 1419 of file ComputePmeCUDAMgr.C.

References CUDA_EVENT_SPREADCHARGE, PmeRealSpaceCompute::gatherForce(), Node::Object(), Node::simParameters, and simParams.

01419                                        {
01420   traceUserBracketEvent(CUDA_EVENT_SPREADCHARGE, beforeWalltime, CmiWallTimer());
01421   beforeWalltime = CmiWallTimer();
01422   // gather (i.e. un-grid) forces
01423   SimParameters *simParams = Node::Object()->simParameters;
01424   Lattice lattice = simParams->lattice;
01425   pmeRealSpaceCompute->gatherForce(lattice, force);
01426   // Set callback that will call gatherForceDone() once gatherForce is done
01427   ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
01428   // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
01429   // gatherForceDone();
01430 }
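
gatherForceSetCallback(this) arranges for gatherForceDone() to run once the asynchronous force-gather kernel completes on the stream. A minimal sketch of such a completion hook using the stock CUDA runtime API (the Device type here is a hypothetical stand-in; the real CudaPmeRealSpaceCompute internals may differ, e.g. by routing through a Charm++ message instead of a direct call):

  #include <cuda_runtime.h>

  struct Device { void gatherForceDone(); };  // hypothetical stand-in

  // Host callback: runs after all work previously enqueued on the stream.
  // CUDA forbids making CUDA API calls from inside such a callback.
  static void CUDART_CB gatherDoneCB(cudaStream_t, cudaError_t status, void* userData) {
    if (status == cudaSuccess)
      static_cast<Device*>(userData)->gatherForceDone();
  }

  void gatherForceSetCallback(Device* dev, cudaStream_t stream) {
    cudaStreamAddCallback(stream, gatherDoneCB, dev, 0);  // flags must be 0
  }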

void ComputePmeCUDADevice::gatherForceDone (  ) 

Definition at line 1456 of file ComputePmeCUDAMgr.C.

References CUDA_EVENT_GATHERFORCE, gatherForceDoneLoop(), mergeForcesOnPatch(), Node::Object(), sendForcesToNeighbors(), Node::simParameters, and SimParameters::useCkLoop.

01456                                            {
01457   // Primary pencil has the forces
01458 
01459   traceUserBracketEvent(CUDA_EVENT_GATHERFORCE, beforeWalltime, CmiWallTimer());
01460 
01461   // Send forces to neighbors
01462   sendForcesToNeighbors();
01463 
01464 #if CMK_SMP && USE_CKLOOP
01465   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01466   if (useCkLoop >= 1) {
01467     CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
01468   } else
01469 #endif
01470 
01471   {
01472     // Loop through home patches and mark the primary pencil as "done"
01473     for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
01474       bool done = false;
01475       // ----------------------------- lock start ---------------------------
01476       // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01477       //       we would really only need one lock per element; that, however, would
01478       //       require numHomePatches locks.
01479       if (pmePencilType != 3) CmiLock(lock_numPencils);
01480       numPencils[forceI][homePatchIndex]--;
01481       if (numPencils[forceI][homePatchIndex] == 0) done = true;
01482       if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01483       // ----------------------------- lock end  ---------------------------
01484       if (done) {
01485         // This home patch is done, launch force merging
01486         thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01487       }
01488     }
01489   }
01490 
01491   // In case we have no home patches, clear the primary pencil storage here
01492   if (numHomePatches == 0) {
01493     int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01494     pmeAtomStorage[forceI][pp0]->clear();
01495   }
01496 
01497 }
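
The decrement-under-lock idiom above (repeated in gatherForceDoneSubset() and recvForcesFromNeighbor()) is a per-patch countdown: numPencils[forceI][i] starts at the number of pencils contributing to home patch i, and whichever caller drives it to zero launches the merge for that patch. A condensed sketch with hypothetical names (lockCounters/launchMerge standing in for the CmiLock calls and the mergeForcesOnPatch() entry):

  #include <vector>

  void lockCounters();               // hypothetical: wraps CmiLock(lock_numPencils)
  void unlockCounters();             //               wraps CmiUnlock(lock_numPencils)
  void launchMerge(int homePatchIndex);

  // Decrement the contribution count of each patch in [first, last]; the
  // caller that takes a count to zero owns the merge for that patch.
  void countDownPatches(std::vector<int>& patchCountdown, int first, int last) {
    for (int i = first; i <= last; i++) {
      lockCounters();                // one node-wide lock guards the whole array
      bool done = (--patchCountdown[i] == 0);
      unlockCounters();
      if (done) launchMerge(i);      // merge launched outside the lock
    }
  }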

void ComputePmeCUDADevice::gatherForceDoneSubset ( int  first,
int  last 
)

Definition at line 1437 of file ComputePmeCUDAMgr.C.

References mergeForcesOnPatch().

Referenced by gatherForceDoneLoop().

01437                                                                     {
01438   for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
01439     bool done = false;
01440     // ----------------------------- lock start ---------------------------
01441     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01442     //       we would really only need one lock per element; that, however, would
01443     //       require numHomePatches locks.
01444     if (pmePencilType != 3) CmiLock(lock_numPencils);
01445     numPencils[forceI][homePatchIndex]--;
01446     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01447     if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01448     // ----------------------------- lock end  ---------------------------
01449     if (done) {
01450       // This home patch is done, launch force merging
01451       mergeForcesOnPatch(homePatchIndex);
01452     }
01453   }
01454 }

int ComputePmeCUDADevice::getDeviceID (  ) 

Definition at line 1076 of file ComputePmeCUDAMgr.C.

01076                                       {
01077   return deviceID;
01078 }

CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy (  ) 

Definition at line 1080 of file ComputePmeCUDAMgr.C.

01080                                                            {
01081   return mgrProxy;
01082 }

cudaStream_t ComputePmeCUDADevice::getStream (  ) 

Definition at line 1072 of file ComputePmeCUDAMgr.C.

01072                                              {
01073   return stream;
01074 }

void ComputePmeCUDADevice::initialize ( PmeGrid pmeGrid_in,
int  pencilIndexY_in,
int  pencilIndexZ_in,
int  deviceID_in,
int  pmePencilType_in,
CProxy_ComputePmeCUDAMgr  mgrProxy_in,
CProxy_PmeAtomFiler  pmeAtomFiler_in 
)

Definition at line 997 of file ComputePmeCUDAMgr.C.

References createStream(), CUDA_EVENT_GATHERFORCE, CUDA_EVENT_SPREADCHARGE, cudaCheck, PmeGrid::yBlocks, and PmeGrid::zBlocks.

00999                                        {
01000 
01001 #define CUDA_EVENT_SPREADCHARGE 90
01002   traceRegisterUserEvent("CUDA spreadCharge", CUDA_EVENT_SPREADCHARGE);
01003 #define CUDA_EVENT_GATHERFORCE 91
01004   traceRegisterUserEvent("CUDA gatherForce", CUDA_EVENT_GATHERFORCE);
01005 
01006   deviceID = deviceID_in;
01007   cudaCheck(cudaSetDevice(deviceID));
01008   pmePencilType = pmePencilType_in;
01009   pmeGrid = pmeGrid_in;
01010   pencilIndexY = pencilIndexY_in;
01011   pencilIndexZ = pencilIndexZ_in;
01012   mgrProxy = mgrProxy_in;
01013   pmeAtomFiler = pmeAtomFiler_in;
01014   // Size of the neighboring pencil grid, max 3x3
01015   yNBlocks = std::min(pmeGrid.yBlocks, 3);
01016   zNBlocks = std::min(pmeGrid.zBlocks, 3);
01017   // Local pencil is at y=0,z=0
01018   if (yNBlocks == 1) {
01019     ylo = 0;
01020     yhi = 0;
01021   } else if (yNBlocks == 2) {
01022     ylo = -1;
01023     yhi = 0;
01024   } else {
01025     ylo = -1;
01026     yhi = 1;
01027   }
01028   if (zNBlocks == 1) {
01029     zlo = 0;
01030     zhi = 0;
01031   } else if (zNBlocks == 2) {
01032     zlo = -1;
01033     zhi = 0;
01034   } else {
01035     zlo = -1;
01036     zhi = 1;
01037   }
01038   
01039   neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
01040   // neighborForcePencils.resize(yNBlocks*zNBlocks);
01041   for (int j=0;j < 2;j++)
01042     homePatchIndexList[j].resize(yNBlocks*zNBlocks);
01043   neighborPatchIndex.resize(yNBlocks*zNBlocks);
01044 
01045   pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
01046   for (int j=0;j < 2;j++) {
01047     pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
01048     for (int z=zlo;z <= zhi;z++) {
01049       for (int y=ylo;y <= yhi;y++) {
01050         int pp = y-ylo + (z-zlo)*yNBlocks;
01051         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01052         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01053         if (y == 0 && z == 0) {
01054           // Primary pencil
01055           pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
01056         } else {
01057           pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
01058         }
01059         pmeAtomStorageAllocatedHere[pp] = true;
01060       }
01061     }
01062   }
01063 
01064   // Create stream for this device
01065   createStream(stream);
01066   streamCreated = true;
01067   pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
01068     deviceID, stream);
01069 
01070 }
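
The block above sizes the neighbor pencil grid: at most a 3x3 patch of pencils around the primary one, shrunk when the whole grid has fewer than three pencils in a direction, with local storage index pp = (y-ylo) + (z-zlo)*yNBlocks and wrapped global pencil coordinates (yt, zt). A self-contained sketch of that mapping, assuming the same conventions:

  #include <algorithm>
  #include <cstdio>

  // Compute the [lo, hi] offset range covered by n neighbor blocks (n <= 3),
  // matching the ylo/yhi and zlo/zhi logic above.
  static void blockRange(int n, int& lo, int& hi) {
    lo = (n == 1) ? 0 : -1;
    hi = (n == 3) ? 1 : 0;
  }

  int main() {
    int yBlocks = 4, zBlocks = 2;           // example pencil grid
    int pencilIndexY = 0, pencilIndexZ = 1; // this device's primary pencil
    int yNBlocks = std::min(yBlocks, 3), zNBlocks = std::min(zBlocks, 3);
    int ylo, yhi, zlo, zhi;
    blockRange(yNBlocks, ylo, yhi);
    blockRange(zNBlocks, zlo, zhi);
    for (int z = zlo; z <= zhi; z++)
      for (int y = ylo; y <= yhi; y++) {
        int pp = (y - ylo) + (z - zlo) * yNBlocks;       // local storage index
        int yt = (pencilIndexY + y + yBlocks) % yBlocks; // wrapped global pencil
        int zt = (pencilIndexZ + z + zBlocks) % zBlocks;
        std::printf("offset (%2d,%2d) -> pp=%d, pencil (%d,%d)\n", y, z, pp, yt, zt);
      }
    return 0;
  }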

void ComputePmeCUDADevice::initializePatches ( int  numHomePatches_in  ) 

Definition at line 1121 of file ComputePmeCUDAMgr.C.

References PmeGrid::yBlocks and PmeGrid::zBlocks.

01121                                                                   {
01122   numHomePatches = numHomePatches_in;
01123   for (int j=0;j < 2;j++)
01124     numPencils[j].resize(numHomePatches);
01125   for (int j=0;j < 2;j++)
01126     plList[j].resize(numHomePatches);
01127   for (int j=0;j < 2;j++)
01128     homePatchForceMsgs[j].resize(numHomePatches);
01129   // for (int j=0;j < 2;j++)
01130   //   numHomeAtoms[j].resize(numHomePatches);
01131   // If we have home patches, register this pencil with the neighbors and with self
01132   if (numHomePatches > 0) {
01133     for (int z=zlo;z <= zhi;z++) {
01134       for (int y=ylo;y <= yhi;y++) {
01135         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01136         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01137         int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01138         mgrProxy[node].registerNeighbor(yt, zt);
01139       }
01140     }
01141   }
01142 }

void ComputePmeCUDADevice::mergeForcesOnPatch ( int  homePatchIndex  ) 

Definition at line 1590 of file ComputePmeCUDAMgr.C.

References sendForcesToPatch(), CudaForce::x, CudaForce::y, and CudaForce::z.

Referenced by gatherForceDone(), and gatherForceDoneSubset().

01590                                                                 {
01591   // We have all the forces for this patch => merge on a single Pe
01592 
01593   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01594 
01595   // Message that goes out to the compute
01596   PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
01597 
01598   if (pmePencilType == 3) {
01599     // 3D box => simple memory copy will do
01600     // Location of forces in the force[] array
01601     int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01602     // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
01603     int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01604     int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01605     int atomEnd   = patchPos[pencilPatchIndex];
01606     int numAtoms = atomEnd-atomStart;
01607     if (forceMsg->zeroCopy) {
01608       // Zero-copy, just pass the pointer
01609       forceMsg->force = force+atomStart;
01610     } else {
01611       memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
01612     }
01613   } else {
01614 
01615     // Zero force array
01616     // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
01617     memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
01618 
01619     // Store forces from primary pencil
01620     {
01621       int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01622       int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
01623       int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01624       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01625       int atomEnd   = patchPos[pencilPatchIndex];
01626       int numAtoms = atomEnd-atomStart;
01627 
01628       // Copy in local forces that are stored in the force[] array
01629       for (int i=0;i < numAtoms;i++) {
01630         forceMsg->force[index[atomStart + i]] = force[atomStart + i];
01631       }
01632 
01633     }
01634 
01635     // Add forces from neighboring pencils
01636     for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
01637       int pp               = plList[forceI][homePatchIndex][j].pp;
01638       int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
01639 
01640       int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
01641       int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
01642       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01643       int atomEnd   = patchPos[pencilPatchIndex];
01644       int numAtoms = atomEnd-atomStart;
01645       CudaForce *dstForce = forceMsg->force;
01646       // CudaForce *srcForce = neighborForcePencils[pp].force;
01647       CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
01648 
01649       for (int i=0;i < numAtoms;i++) {
01650         dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
01651         dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
01652         dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
01653       }
01654 
01655     }
01656   }
01657 
01658   // Clear storage
01659   plList[forceI][homePatchIndex].clear();
01660 
01661   // ----------------------------- lock start ---------------------------
01662   // bool done = false;
01663   CmiLock(lock_numHomePatchesMerged);
01664   numHomePatchesMerged++;
01665   if (numHomePatchesMerged == numHomePatches) {
01666     // Reset counter
01667     numHomePatchesMerged = 0;
01668 
01669     // Delete messages
01670     for (int i=0;i < neighborForcePencilMsgs.size();i++) {
01671       if (neighborForcePencilMsgs[i] != NULL) {
01672         delete neighborForcePencilMsgs[i];
01673         neighborForcePencilMsgs[i] = NULL;
01674       }
01675     }
01676 
01677     // Done merging and sending forces => clear storage
01678     for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
01679       homePatchIndexList[forceI][pp].clear();
01680     for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
01681       pmeAtomStorage[forceI][pp]->clear();
01682 
01683   }
01684   CmiUnlock(lock_numHomePatchesMerged);
01685   // ----------------------------- lock end  ---------------------------
01686 
01687   // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
01688   // this node will then rely the message to the Pe that originally sent the atoms
01689   int pe = forceMsg->pe;
01690   if (CkNodeOf(pe) != CkMyNode())
01691     thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
01692   else
01693     sendForcesToPatch(forceMsg);
01694 
01695 }
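
Both branches above rely on the atom filer's permutation: index[] maps a position in a pencil's packed atom range back to the atom's slot in the home patch, so merging is a scatter (primary pencil) followed by scatter-adds (neighbor pencils). A minimal sketch of the add step, assuming CudaForce is a plain x/y/z float struct as used throughout:

  struct CudaForce { float x, y, z; };

  // Scatter-add one pencil's packed force range [atomStart, atomEnd) into the
  // per-patch force message; index[i] gives the atom's slot within the patch.
  void addPencilForces(CudaForce* dstForce, const CudaForce* srcForce,
                       const int* index, int atomStart, int atomEnd) {
    for (int i = atomStart; i < atomEnd; i++) {
      dstForce[index[i]].x += srcForce[i].x;
      dstForce[index[i]].y += srcForce[i].y;
      dstForce[index[i]].z += srcForce[i].z;
    }
  }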

void ComputePmeCUDADevice::recvAtoms ( PmeAtomMsg msg  ) 

Definition at line 1153 of file ComputePmeCUDAMgr.C.

References PmeAtomMsg::atoms, PmeAtomMsg::compute, PmeForceMsg::compute, PmeAtomMsg::doEnergy, PmeAtomMsg::doVirial, PmeAtomFiler::fileAtoms(), PmeAtomFiler::getAtomIndex(), PmeAtomFiler::getNumAtoms(), NAMD_bug(), PmeForceMsg::numAtoms, PmeAtomMsg::numAtoms, PmeForceMsg::numStrayAtoms, Node::Object(), PmeForceMsg::pe, PmeAtomMsg::pe, PRIORITY_SIZE, sendAtomsToNeighbors(), Node::simParameters, simParams, PmeGrid::yBlocks, PmeGrid::zBlocks, and PmeForceMsg::zeroCopy.

01153                                                     {
01154 
01155   PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
01156   // Store "virial" and "energy" flags
01157   doVirial = msg->doVirial;
01158   doEnergy = msg->doEnergy;
01159   // Get lattice
01160   SimParameters *simParams = Node::Object()->simParameters;
01161   Lattice lattice = simParams->lattice;
01162 
01163   // Primary pencil index
01164   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01165   int p0 = 0;
01166   int pencilPatchIndex[9];
01167   int numStrayAtomsPatch = 0;
01168   if (pmePencilType == 3) {
01169     // 3D box => store atoms directly without index
01170     // NOTE: We don't check for stray atoms here!
01171     pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01172   } else {
01173 
01174     // File atoms
01175     pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
01176       pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
01177 
01178     // Loop through pencils and add atoms to pencil atom lists
01179     // NOTE: we only store to a neighboring pencil if there are atoms to store
01180     int numAtomsCheck = 0;
01181     for (int p=0;p < 9;p++) {
01182 
01183       int y = (p % 3);
01184       int z = (p / 3);
01185 
01186       int pp = y + z*yNBlocks;
01187       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01188       if (pp == pp0) p0 = p;
01189       if (pp == pp0 || numAtoms > 0) {
01190         if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
01191           NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
01192         int* index = pmeAtomFilerPtr->getAtomIndex(p);
01193         pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
01194         // Number of patches in this storage tells you how many home patches contributed and
01195         // homePatchIndex (pe) tells you which patch contributed
01196         numAtomsCheck += numAtoms;
01197       }
01198     }
01199 
01200     // Deal with stray atoms
01201     numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
01202     if (numStrayAtomsPatch > 0) {
01203       int* index = pmeAtomFilerPtr->getAtomIndex(9);
01204       CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
01205       for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
01206         int j = index[i];
01207         CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
01208       }
01209     }
01210 
01211     if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
01212       NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
01213   }
01214 
01215   // Create storage for home patch forces
01216   PmeForceMsg *forceMsg;
01217   if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
01218     // 3D FFT and compute resides on the same node => use zero-copy forces
01219     forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
01220     forceMsg->zeroCopy = true;
01221   } else {
01222     forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
01223     forceMsg->zeroCopy = false;
01224   }
01225   forceMsg->numAtoms = msg->numAtoms;
01226   forceMsg->pe = msg->pe;
01227   forceMsg->compute = msg->compute;
01228   forceMsg->numStrayAtoms = numStrayAtomsPatch;
01229 
01230   bool done = false;
01231   // ----------------------------- lock start ---------------------------
01232   // Only after writing has finished do we get homePatchIndex.
01233   // This guarantees that, for whichever thread receives "done=true", writing has
01234   // finished on ALL threads.
01235   CmiLock(lock_recvAtoms);
01236   numStrayAtoms += numStrayAtomsPatch;
01237   // Secure homePatchIndex. All writes after this must be inside the lock region
01238   int homePatchIndex = numHomePatchesRecv;
01239   // Store primary pencil first
01240   plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
01241   if (pmePencilType != 3) {
01242     // Go back through the neighboring pencils and store "homePatchIndex"
01243     for (int p=0;p < 9;p++) {
01244 
01245       int y = (p % 3);
01246       int z = (p / 3);
01247 
01248       int pp = y + z*yNBlocks;
01249       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01250       if (pp != pp0 && numAtoms > 0) {
01251         homePatchIndexList[atomI][pp].push_back(homePatchIndex);
01252         // plList[0...numHomePatches-1] stores, for each home patch, the locations of
01253         //                                the pencils that share it
01254         // plList[homePatchIndex].size() gives the number of pencils that share this home patch
01255         plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
01256       }
01257     }
01258   }
01259   homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
01260   // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
01261   // Set the number of pencils contributing to this home patch
01262   numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
01263   //
01264   numHomePatchesRecv++;
01265   if (numHomePatchesRecv == numHomePatches) {
01266     // Reset counter
01267     numHomePatchesRecv = 0;
01268     done = true;
01269   }
01270   CmiUnlock(lock_recvAtoms);
01271   // ----------------------------- lock end  ---------------------------
01272 
01273   // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
01274 
01275   delete msg;
01276 
01277   if (done) {
01278     // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
01279     sendAtomsToNeighbors();
01280   }
01281 }
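
pmeAtomFiler buckets each atom into one of nine pencil lists (bucket 9 collects strays); bucket p decodes to grid offsets (y, z) = (p % 3, p / 3), already shifted so that 0 corresponds to ylo/zlo, which is why pp = y + z*yNBlocks indexes the same storage as the (y-ylo) + (z-zlo)*yNBlocks arithmetic used elsewhere. A small standalone illustration of the decode:

  #include <cstdio>

  // Decode filer bucket p (0..8) into neighbor-grid offsets and the pencil
  // storage index pp; bucket 9 (not shown) collects stray atoms.
  int main() {
    int yNBlocks = 3;  // example: full 3x3 neighborhood
    for (int p = 0; p < 9; p++) {
      int y = p % 3;               // 0..2, i.e. ylo..yhi shifted to zero
      int z = p / 3;               // 0..2, i.e. zlo..zhi shifted to zero
      int pp = y + z * yNBlocks;   // index into pmeAtomStorage[atomI][]
      std::printf("bucket %d -> (y=%d, z=%d) -> pp=%d\n", p, y, z, pp);
    }
    return 0;
  }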

void ComputePmeCUDADevice::recvAtomsFromNeighbor ( PmeAtomPencilMsg msg  ) 

Definition at line 1327 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, NAMD_bug(), PmeAtomPencilMsg::numAtoms, registerRecvAtomsFromNeighbor(), PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeGrid::yBlocks, and PmeGrid::zBlocks.

01327                                                                       {
01328   // Store into primary pencil
01329   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01330   // Compute pencil index relative to primary pencil
01331   int y = msg->srcY - pencilIndexY;
01332   if (y < ylo) y += pmeGrid.yBlocks;
01333   if (y > yhi) y -= pmeGrid.yBlocks;
01334   int z = msg->srcZ - pencilIndexZ;
01335   if (z < zlo) z += pmeGrid.zBlocks;
01336   if (z > zhi) z -= pmeGrid.zBlocks;
01337   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01338     NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
01339   }
01340   // Read energy and virial flags
01341   doEnergy = msg->doEnergy;
01342   doVirial = msg->doVirial;
01343   // Pencil index where atoms came from
01344   int pp = y-ylo + (z-zlo)*yNBlocks;
01345   // Store atoms and mark down the patch index where these atoms were added
01346   neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01347 
01348   delete msg;
01349 
01350   registerRecvAtomsFromNeighbor();
01351 }
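
The srcY/srcZ arithmetic above converts an absolute source pencil index into an offset relative to the primary pencil, wrapping across the periodic pencil grid so that a valid neighbor always lands in [ylo, yhi] x [zlo, zhi]. The same decode reappears in recvForcesFromNeighbor(). As a sketch:

  // Map an absolute source pencil index to an offset relative to 'center',
  // wrapped into [lo, hi] on a periodic grid of nBlocks pencils. A result
  // outside [lo, hi] means the source is not a neighbor (caught by NAMD_bug).
  int relativeOffset(int src, int center, int nBlocks, int lo, int hi) {
    int d = src - center;
    if (d < lo) d += nBlocks;
    if (d > hi) d -= nBlocks;
    return d;
  }

For example, with nBlocks = 4, center = 0, lo = -1, hi = 1, a message from pencil 3 decodes to offset -1, i.e. the wrap-around neighbor below.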

void ComputePmeCUDADevice::recvForcesFromNeighbor ( PmeForcePencilMsg msg  ) 

Definition at line 1535 of file ComputePmeCUDAMgr.C.

References NAMD_bug(), PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeGrid::yBlocks, and PmeGrid::zBlocks.

01535                                                                         {
01536 
01537   // Source pencil index
01538   int y = msg->srcY - pencilIndexY;
01539   if (y < ylo) y += pmeGrid.yBlocks;
01540   if (y > yhi) y -= pmeGrid.yBlocks;
01541   int z = msg->srcZ - pencilIndexZ;
01542   if (z < zlo) z += pmeGrid.zBlocks;
01543   if (z > zhi) z -= pmeGrid.zBlocks;
01544 
01545   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01546     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
01547   }
01548 
01549   // Source pencil
01550   int pp = y-ylo + (z-zlo)*yNBlocks;
01551 
01552   // Store message (deleted in mergeForcesOnPatch)
01553   neighborForcePencilMsgs[pp] = msg;
01554 
01555   // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
01556   // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
01557   // neighborForcePencils[pp].numAtoms = msg->numAtoms;
01558   // neighborForcePencils[pp].y = msg->y;
01559   // neighborForcePencils[pp].z = msg->z;
01560   // neighborForcePencils[pp].srcY = msg->srcY;
01561   // neighborForcePencils[pp].srcZ = msg->srcZ;
01562   // delete msg;
01563 
01564   // numPatches = number of home patches this pencil has
01565   int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
01566   if (numPatches != homePatchIndexList[forceI][pp].size()) {
01567     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
01568   }
01569   for (int i=0;i < numPatches;i++) {
01570     // this pencil contributed to home patch with index "homePatchIndex"
01571     int homePatchIndex = homePatchIndexList[forceI][pp][i];
01572     // ----------------------------- lock start ---------------------------
01573     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01574     //       we would really only need one lock per element; that, however, would
01575     //       require numHomePatches locks.
01576     bool done = false;
01577     CmiLock(lock_numPencils);
01578     numPencils[forceI][homePatchIndex]--;
01579     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01580     CmiUnlock(lock_numPencils);
01581     // ----------------------------- lock end  ---------------------------
01582     if (done) {
01583       // This home patch is done, launch force merging
01584       thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01585     }
01586   }
01587 
01588 }

void ComputePmeCUDADevice::registerNeighbor (  ) 

Definition at line 1144 of file ComputePmeCUDAMgr.C.

01144                                             {
01145   CmiLock(lock_numHomePatchesMerged);
01146   numNeighborsExpected++;
01147   CmiUnlock(lock_numHomePatchesMerged);
01148 }

void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor (  ) 

Definition at line 1353 of file ComputePmeCUDAMgr.C.

References spreadCharge().

Referenced by recvAtomsFromNeighbor(), and sendAtomsToNeighbors().

01353                                                          {
01354   // Primary pencil
01355   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01356 
01357   bool done = false;
01358   // ----------------------------- lock start ---------------------------
01359   CmiLock(lock_numNeighborsRecv);
01360   numNeighborsRecv++;
01361   if (numNeighborsRecv == numNeighborsExpected) {
01362     // Reset counter
01363     numNeighborsRecv = 0;
01364     done = true;
01365   }
01366   CmiUnlock(lock_numNeighborsRecv);
01367   // ----------------------------- lock end  ---------------------------
01368 
01369   if (done) {
01370     // Primary pencil has received all atoms and writing has finished => spread charge
01371     spreadCharge();
01372   }  
01373 }

void ComputePmeCUDADevice::sendAtomsToNeighbor ( int  y,
int  z,
int  atomIval 
)

Definition at line 1301 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::numAtoms, PRIORITY_SIZE, PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeAtomPencilMsg::y, PmeGrid::yBlocks, PmeAtomPencilMsg::z, and PmeGrid::zBlocks.

Referenced by sendAtomsToNeighbors().

01301                                                                          {
01302   // Pencil index  
01303   int pp = y-ylo + (z-zlo)*yNBlocks;
01304   // This neighbor pencil is done, finish it up before accessing it
01305   pmeAtomStorage[atomIval][pp]->finish();
01306   // Compute destination neighbor pencil index (yt,zt)
01307   int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01308   int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01309   int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
01310   CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
01311   PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
01312   memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
01313   msgPencil->numAtoms = numAtoms;
01314   // Store destination pencil index
01315   msgPencil->y = yt;
01316   msgPencil->z = zt;
01317   // Store source pencil index
01318   msgPencil->srcY = pencilIndexY;
01319   msgPencil->srcZ = pencilIndexZ;
01320   // Store energy and virial flags
01321   msgPencil->doEnergy = doEnergy;
01322   msgPencil->doVirial = doVirial;
01323   int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01324   mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
01325 }

void ComputePmeCUDADevice::sendAtomsToNeighbors (  ) 

Definition at line 1286 of file ComputePmeCUDAMgr.C.

References registerRecvAtomsFromNeighbor() and sendAtomsToNeighbor().

Referenced by recvAtoms().

01286                                                 {
01287   for (int z=zlo;z <= zhi;z++) {
01288     for (int y=ylo;y <= yhi;y++) {
01289       // Only send to neighbors, not self
01290       if (y != 0 || z != 0) {
01291         // NOTE: Must send the atomI value since it will change in spreadCharge(), which
01292         //       might occur before these sends have been performed
01293         thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
01294       }
01295     }
01296   }
01297   // Register primary pencil
01298   registerRecvAtomsFromNeighbor();
01299 }

void ComputePmeCUDADevice::sendForcesToNeighbors (  ) 

Definition at line 1502 of file ComputePmeCUDAMgr.C.

References PmeForcePencilMsg::force, PmeForcePencilMsg::numAtoms, PRIORITY_SIZE, PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeForcePencilMsg::y, PmeGrid::yBlocks, PmeForcePencilMsg::z, and PmeGrid::zBlocks.

Referenced by gatherForceDone().

01502                                                  {
01503   // Primary pencil has the forces
01504   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01505   int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01506   // Loop through neighboring pencils
01507   for (int z=zlo;z <= zhi;z++) {
01508     for (int y=ylo;y <= yhi;y++) {
01509       // Only send to neighbors, not self
01510       if (y != 0 || z != 0) {
01511         int pp = y-ylo + (z-zlo)*yNBlocks;
01512         int patchIndex = neighborPatchIndex[pp];
01513         int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
01514         int atomEnd   = patchPos[patchIndex];
01515         int natom = atomEnd-atomStart;
01516         // copy forces
01517         PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
01518         msg->numAtoms = natom;
01519         memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
01520         // Calculate destination pencil index (dstY, dstZ) for this neighbor
01521         int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01522         int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01523         int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
01524         msg->y = dstY;
01525         msg->z = dstZ;
01526         // Store source pencil index
01527         msg->srcY = pencilIndexY;
01528         msg->srcZ = pencilIndexZ;
01529         mgrProxy[node].recvForcesFromNeighbor(msg);
01530       }
01531     }
01532   }
01533 }
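
patchPos[] returned by the atom storage is an inclusive prefix sum over the patches packed into a pencil, so patch i occupies the half-open range [patchPos[i-1], patchPos[i]) with an implicit leading zero; the same atomStart/atomEnd arithmetic appears in mergeForcesOnPatch(). A tiny sketch, assuming that layout:

  // patchPos[i] = total atom count of patches 0..i (inclusive prefix sum).
  // Returns the packed [start, end) slot range of patch i.
  void patchRange(const int* patchPos, int i, int& atomStart, int& atomEnd) {
    atomStart = (i == 0) ? 0 : patchPos[i - 1];
    atomEnd   = patchPos[i];
  }

For patchPos = {10, 25, 31}, patch 1 occupies slots [10, 25) and holds 15 atoms.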

void ComputePmeCUDADevice::sendForcesToPatch ( PmeForceMsg forceMsg  ) 

Definition at line 1697 of file ComputePmeCUDAMgr.C.

References PmeForceMsg::compute, Compute::localWorkMsg, PmeForceMsg::pe, and ComputePmeCUDA::storePmeForceMsg().

Referenced by mergeForcesOnPatch().

01697                                                                   {
01698   // Now we're on the node that holds the Pe, hence the "compute" pointer is valid
01699   int pe                  = forceMsg->pe;
01700   ComputePmeCUDA *compute = forceMsg->compute;
01701 
01702   // Store message for use in ComputePmeCUDA, where it'll also be deleted.
01703   if (compute->storePmeForceMsg(forceMsg)) {
01704     // Enqueue on the pe that sent the atoms in the first place
01705     LocalWorkMsg *lmsg = compute->localWorkMsg;
01706     CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
01707     wdProxy[pe].enqueuePme(lmsg);
01708   }
01709 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilX  pmePencilX_in  ) 

Definition at line 1096 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01096                                                                              {
01097   if (pmePencilType != 1)
01098     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
01099   pmePencilX = pmePencilX_in;
01100 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXY  pmePencilXY_in  ) 

Definition at line 1090 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01090                                                                                {
01091   if (pmePencilType != 2)
01092     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
01093   pmePencilXY = pmePencilXY_in;
01094 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXYZ  pmePencilXYZ_in  ) 

Definition at line 1084 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01084                                                                                  {
01085   if (pmePencilType != 3)
01086     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
01087   pmePencilXYZ = pmePencilXYZ_in;
01088 }

void ComputePmeCUDADevice::spreadCharge (  ) 

Definition at line 1375 of file ComputePmeCUDAMgr.C.

References PmeRealSpaceCompute::copyAtoms(), PmeRunMsg::doEnergy, PmeRunMsg::doVirial, PmeRunMsg::lattice, PmeRunMsg::numStrayAtoms, Node::Object(), Node::simParameters, simParams, PmeRealSpaceCompute::spreadCharge(), and std::swap().

Referenced by registerRecvAtomsFromNeighbor().

01375                                         {
01376   // Spread charges in primary pencil
01377   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01378   // Primary pencil is done, finish it up before accessing it
01379   // (clearing is done in mergeForcesOnPatch)
01380   pmeAtomStorage[atomI][pp0]->finish();
01381   // Get the number of atoms and pointer to atoms
01382   int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
01383   CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
01384   // Flip atomI <-> forceI
01385   std::swap(atomI, forceI);
01386   // Re-allocate force buffer if needed
01387   reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
01388   // Setup patches and atoms
01389   SimParameters *simParams = Node::Object()->simParameters;
01390   Lattice lattice = simParams->lattice;
01391   pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
01392   // Spread charge
01393   beforeWalltime = CmiWallTimer();
01394   pmeRealSpaceCompute->spreadCharge(lattice);
01395   // Send "charge grid ready" message to the PME solver
01396   PmeRunMsg *pmeRunMsg = new PmeRunMsg();
01397   pmeRunMsg->doVirial = doVirial;
01398   pmeRunMsg->doEnergy = doEnergy;
01399   pmeRunMsg->lattice = lattice;
01400   pmeRunMsg->numStrayAtoms = numStrayAtoms;
01401   // Reset stray atom counter
01402   numStrayAtoms = 0;
01403   switch(pmePencilType) {
01404     case 1:
01405     pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
01406     break;
01407     case 2:
01408     pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
01409     break;
01410     case 3:
01411     pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
01412     break;
01413   }
01414 }
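
Two details above are worth noting: std::swap(atomI, forceI) flips the double buffers, so atoms for the next step can be filed under the new atomI while this step's forces are handled under forceI, and reallocate_host grows the pinned force buffer geometrically (factor 1.5) so repeated size changes cost amortized O(1) allocations. A standalone sketch of the growth rule, assuming plain heap memory in place of NAMD's pinned-memory helpers:

  // Grow-only reallocation in the spirit of reallocate_host(&force,
  // &forceCapacity, numAtoms, 1.5f): keep the buffer if it is big enough,
  // otherwise replace it with one scaled up by 'factor'. Contents are not
  // preserved, which is fine here since gatherForce() overwrites force[].
  template <typename T>
  void reallocateGrow(T** buf, int* capacity, int needed, float factor) {
    if (needed <= *capacity) return;
    delete[] *buf;
    *capacity = (int)(needed * factor);
    *buf = new T[*capacity];
  }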


The documentation for this class was generated from the following files:

ComputePmeCUDAMgr.h
ComputePmeCUDAMgr.C