ComputePmeCUDADevice Class Reference

#include <ComputePmeCUDAMgr.h>


Public Member Functions

 ComputePmeCUDADevice ()
 ComputePmeCUDADevice (CkMigrateMessage *m)
 ~ComputePmeCUDADevice ()
void initialize (PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
int getDeviceID ()
cudaStream_t getStream ()
CProxy_ComputePmeCUDAMgr getMgrProxy ()
void setPencilProxy (CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
void setPencilProxy (CProxy_CudaPmePencilXY pmePencilXY_in)
void setPencilProxy (CProxy_CudaPmePencilX pmePencilX_in)
void activate_pencils ()
void initializePatches (int numHomePatches_in)
void registerNeighbor ()
void recvAtoms (PmeAtomMsg *msg)
void sendAtomsToNeighbors ()
void sendAtomsToNeighbor (int y, int z, int atomIval)
void recvAtomsFromNeighbor (PmeAtomPencilMsg *msg)
void registerRecvAtomsFromNeighbor ()
void spreadCharge ()
void gatherForce ()
void gatherForceDone ()
void sendForcesToNeighbors ()
void recvForcesFromNeighbor (PmeForcePencilMsg *msg)
void mergeForcesOnPatch (int homePatchIndex)
void sendForcesToPatch (PmeForceMsg *forceMsg)
void gatherForceDoneSubset (int first, int last)

Classes

struct  PencilLocation


Detailed Description

ComputePmeCUDADevice manages the real-space part of CUDA PME for a single device: it receives atoms from home patches, files them into the primary pencil and its up-to-3x3 pencil neighborhood, spreads charges on the GPU, and, after the reciprocal-space solve, gathers forces, merges the per-pencil contributions, and returns them to the patches that supplied the atoms. Two buffer sets, selected by the indices atomI and forceI, allow a new step's atoms to arrive while the previous step's forces are still being processed.

Definition at line 291 of file ComputePmeCUDAMgr.h.


Constructor & Destructor Documentation

ComputePmeCUDADevice::ComputePmeCUDADevice (  ) 

Definition at line 938 of file ComputePmeCUDAMgr.C.

00938                                            {
00939   // __sdag_init();
00940   numHomePatches = 0;
00941   forceCapacity = 0;
00942   force = NULL;
00943   pmeRealSpaceCompute = NULL;
00944   streamCreated = false;
00945   lock_numHomePatchesMerged = CmiCreateLock();
00946   lock_numPencils = CmiCreateLock();
00947   lock_numNeighborsRecv = CmiCreateLock();
00948   lock_recvAtoms = CmiCreateLock();
00949   numNeighborsExpected = 0;
00950   numStrayAtoms = 0;
00951   // Reset counters
00952   numNeighborsRecv = 0;
00953   numHomePatchesRecv = 0;
00954   numHomePatchesMerged = 0;
00955   atomI = 0;
00956   forceI = 1;
00957 }

ComputePmeCUDADevice::ComputePmeCUDADevice ( CkMigrateMessage *  m  ) 

Definition at line 959 of file ComputePmeCUDAMgr.C.

00959                                                               {
00960   // __sdag_init();
00961   numHomePatches = 0;
00962   forceCapacity = 0;
00963   force = NULL;
00964   pmeRealSpaceCompute = NULL;
00965   streamCreated = false;
00966   lock_numHomePatchesMerged = CmiCreateLock();
00967   lock_numPencils = CmiCreateLock();
00968   lock_numNeighborsRecv = CmiCreateLock();
00969   lock_recvAtoms = CmiCreateLock();
00970   numNeighborsExpected = 0;
00971   numStrayAtoms = 0;
00972   // Reset counters
00973   numNeighborsRecv = 0;
00974   numHomePatchesRecv = 0;
00975   numHomePatchesMerged = 0;
00976   atomI = 0;
00977   forceI = 1;
00978 }

ComputePmeCUDADevice::~ComputePmeCUDADevice (  ) 

Definition at line 980 of file ComputePmeCUDAMgr.C.

References cudaCheck, and j.

00980                                             {
00981   if (streamCreated) {
00982     cudaCheck(cudaSetDevice(deviceID));
00983     cudaCheck(cudaStreamDestroy(stream));
00984   }
00985   for (int j=0;j < 2;j++)
00986     for (int i=0;i < pmeAtomStorage[j].size();i++) {
00987       if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
00988     }
00989   if (force != NULL) deallocate_host<CudaForce>(&force);
00990   if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
00991   CmiDestroyLock(lock_numHomePatchesMerged);
00992   CmiDestroyLock(lock_numPencils);
00993   CmiDestroyLock(lock_numNeighborsRecv);
00994   CmiDestroyLock(lock_recvAtoms);
00995 }


Member Function Documentation

void ComputePmeCUDADevice::activate_pencils (  ) 

Definition at line 1097 of file ComputePmeCUDAMgr.C.

References PmeStartMsg::data, PmeStartMsg::dataSize, PmeRealSpaceCompute::getData(), and PmeRealSpaceCompute::getDataSize().

01097                                             {
01098   if (pmePencilType == 1) {
01099     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01100     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01101     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01102     pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
01103   } else if (pmePencilType == 2) {
01104     PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
01105     pmeStartXMsg->data = pmeRealSpaceCompute->getData();
01106     pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01107     pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
01108   } else if (pmePencilType == 3) {
01109     PmeStartMsg* pmeStartMsg = new PmeStartMsg();
01110     pmeStartMsg->data = pmeRealSpaceCompute->getData();
01111     pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
01112     pmePencilXYZ[0].start(pmeStartMsg);
01113   }
01114 }
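
The three branches mirror the three FFT decompositions the manager can configure, matching the three setPencilProxy() overloads documented below; in every case the message hands the solver the pointer to, and size of, the real-space charge grid owned by pmeRealSpaceCompute. A hedged summary of the mapping (the enum is illustrative only, not part of the NAMD API):

  // pmePencilType selects which FFT decomposition this device feeds:
  //   1 -> 1D pencils along x: start element (0, pencilIndexY, pencilIndexZ)
  //   2 -> 2D xy slabs:        start element (0, 0, pencilIndexZ)
  //   3 -> a single 3D box:    element [0] of pmePencilXYZ
  enum class PmeDecomposition { PencilX = 1, SlabXY = 2, Box3D = 3 };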

void ComputePmeCUDADevice::gatherForce (  ) 

Definition at line 1414 of file ComputePmeCUDAMgr.C.

References CUDA_PME_SPREADCHARGE_EVENT, PmeRealSpaceCompute::gatherForce(), Node::Object(), Node::simParameters, and simParams.

01414                                        {
01415   traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
01416   beforeWalltime = CmiWallTimer();
01417   // gather (i.e. un-grid) forces
01418   SimParameters *simParams = Node::Object()->simParameters;
01419   Lattice lattice = simParams->lattice;
01420   pmeRealSpaceCompute->gatherForce(lattice, force);
01421   // Set callback that will call gatherForceDone() once gatherForce is done
01422   ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
01423   // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
01424   // gatherForceDone();
01425 }

void ComputePmeCUDADevice::gatherForceDone (  ) 

Definition at line 1451 of file ComputePmeCUDAMgr.C.

References CUDA_PME_GATHERFORCE_EVENT, gatherForceDoneLoop(), mergeForcesOnPatch(), Node::Object(), sendForcesToNeighbors(), Node::simParameters, and SimParameters::useCkLoop.

01451                                            {
01452   // Primary pencil has the forces
01453 
01454   traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
01455 
01456   // Send forces to neighbors
01457   sendForcesToNeighbors();
01458 
01459 #if CMK_SMP && USE_CKLOOP
01460   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01461   if (useCkLoop >= 1) {
01462     CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
01463   } else
01464 #endif
01465 
01466   {
01467     // Loop through home patches and mark the primary pencil as "done"
01468     for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
01469       bool done = false;
01470       // ----------------------------- lock start ---------------------------
01471       // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01472       //       we really only need to lock each element; that would require
01473       //       numHomePatches separate locks.
01474       if (pmePencilType != 3) CmiLock(lock_numPencils);
01475       numPencils[forceI][homePatchIndex]--;
01476       if (numPencils[forceI][homePatchIndex] == 0) done = true;
01477       if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01478       // ----------------------------- lock end  ---------------------------
01479       if (done) {
01480         // This home patch is done, launch force merging
01481         thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01482       }
01483     }
01484   }
01485 
01486   // In case we have no home patches, clear the primary pencil storage here
01487   if (numHomePatches == 0) {
01488     int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01489     pmeAtomStorage[forceI][pp0]->clear();
01490   }
01491 
01492 }
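
Both this function and recvForcesFromNeighbor() decrement the same numPencils[] counters from different threads, so whichever caller drives a patch's counter to zero owns the merge. A minimal standalone sketch of that countdown pattern, with std::mutex standing in for the CmiLock used above (the type and names are illustrative, not NAMD code):

  #include <mutex>
  #include <vector>

  struct PencilCountdown {
    std::mutex lock;               // plays the role of lock_numPencils
    std::vector<int> numPencils;   // outstanding contributions per home patch

    // Returns true exactly once per patch: for the caller whose
    // decrement brings the counter to zero.
    bool contribute(int homePatchIndex) {
      std::lock_guard<std::mutex> guard(lock);
      return --numPencils[homePatchIndex] == 0;
    }
  };

The caller that sees true launches mergeForcesOnPatch(homePatchIndex); every other decrement returns without side effects.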

void ComputePmeCUDADevice::gatherForceDoneSubset ( int first, int last )

Definition at line 1432 of file ComputePmeCUDAMgr.C.

References mergeForcesOnPatch().

Referenced by gatherForceDoneLoop().

01432                                                                     {
01433   for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
01434     bool done = false;
01435     // ----------------------------- lock start ---------------------------
01436     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01437     //       we really only need to lock each element; that would require
01438     //       numHomePatches separate locks.
01439     if (pmePencilType != 3) CmiLock(lock_numPencils);
01440     numPencils[forceI][homePatchIndex]--;
01441     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01442     if (pmePencilType != 3) CmiUnlock(lock_numPencils);
01443     // ----------------------------- lock end  ---------------------------
01444     if (done) {
01445       // This home patch is done, launch force merging
01446       mergeForcesOnPatch(homePatchIndex);
01447     }
01448   }
01449 }

int ComputePmeCUDADevice::getDeviceID (  ) 

Definition at line 1071 of file ComputePmeCUDAMgr.C.

01071                                       {
01072   return deviceID;
01073 }

CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy (  ) 

Definition at line 1075 of file ComputePmeCUDAMgr.C.

01075                                                            {
01076   return mgrProxy;
01077 }

cudaStream_t ComputePmeCUDADevice::getStream (  ) 

Definition at line 1067 of file ComputePmeCUDAMgr.C.

01067                                              {
01068   return stream;
01069 }

void ComputePmeCUDADevice::initialize ( PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in )

Definition at line 997 of file ComputePmeCUDAMgr.C.

References createStream(), cudaCheck, j, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

00999                                        {
01000 
01001   deviceID = deviceID_in;
01002   cudaCheck(cudaSetDevice(deviceID));
01003   pmePencilType = pmePencilType_in;
01004   pmeGrid = pmeGrid_in;
01005   pencilIndexY = pencilIndexY_in;
01006   pencilIndexZ = pencilIndexZ_in;
01007   mgrProxy = mgrProxy_in;
01008   pmeAtomFiler = pmeAtomFiler_in;
01009   // Size of the neighboring pencil grid, max 3x3
01010   yNBlocks = std::min(pmeGrid.yBlocks, 3);
01011   zNBlocks = std::min(pmeGrid.zBlocks, 3);
01012   // Local pencil is at y=0,z=0
01013   if (yNBlocks == 1) {
01014     ylo = 0;
01015     yhi = 0;
01016   } else if (yNBlocks == 2) {
01017     ylo = -1;
01018     yhi = 0;
01019   } else {
01020     ylo = -1;
01021     yhi = 1;
01022   }
01023   if (zNBlocks == 1) {
01024     zlo = 0;
01025     zhi = 0;
01026   } else if (zNBlocks == 2) {
01027     zlo = -1;
01028     zhi = 0;
01029   } else {
01030     zlo = -1;
01031     zhi = 1;
01032   }
01033   
01034   neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
01035   // neighborForcePencils.resize(yNBlocks*zNBlocks);
01036   for (int j=0;j < 2;j++)
01037     homePatchIndexList[j].resize(yNBlocks*zNBlocks);
01038   neighborPatchIndex.resize(yNBlocks*zNBlocks);
01039 
01040   pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
01041   for (int j=0;j < 2;j++) {
01042     pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
01043     for (int z=zlo;z <= zhi;z++) {
01044       for (int y=ylo;y <= yhi;y++) {
01045         int pp = y-ylo + (z-zlo)*yNBlocks;
01046         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01047         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01048         if (y == 0 && z == 0) {
01049           // Primary pencil
01050           pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
01051         } else {
01052           pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
01053         }
01054         pmeAtomStorageAllocatedHere[pp] = true;
01055       }
01056     }
01057   }
01058 
01059   // Create stream for this device
01060   createStream(stream);
01061   streamCreated = true;
01062   pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
01063     deviceID, stream);
01064 
01065 }
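
The neighborhood extents and the flattened pencil index pp = (y-ylo) + (z-zlo)*yNBlocks set up here are reused by nearly every other member function, with the primary pencil always at offset (0,0). A small standalone sketch of the index arithmetic (the extents() helper and the main() driver are illustrative, not NAMD code):

  #include <algorithm>
  #include <cstdio>

  // Neighborhood extents for a given block count, as in initialize():
  // 1 block -> [0,0], 2 blocks -> [-1,0], 3 or more -> [-1,1].
  static void extents(int nBlocks, int &lo, int &hi) {
    int n = std::min(nBlocks, 3);
    lo = (n == 1) ? 0 : -1;
    hi = (n == 3) ? 1 : 0;
  }

  int main() {
    int ylo, yhi, zlo, zhi;
    extents(3, ylo, yhi);  // e.g. pmeGrid.yBlocks >= 3
    extents(2, zlo, zhi);  // e.g. pmeGrid.zBlocks == 2
    int yNBlocks = yhi - ylo + 1;
    for (int z = zlo; z <= zhi; z++)
      for (int y = ylo; y <= yhi; y++)
        printf("offset (%+d,%+d) -> pp = %d\n",
               y, z, (y - ylo) + (z - zlo) * yNBlocks);
    return 0;
  }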

void ComputePmeCUDADevice::initializePatches ( int  numHomePatches_in  ) 

Definition at line 1116 of file ComputePmeCUDAMgr.C.

References j, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

01116                                                                   {
01117   numHomePatches = numHomePatches_in;
01118   for (int j=0;j < 2;j++)
01119     numPencils[j].resize(numHomePatches);
01120   for (int j=0;j < 2;j++)
01121     plList[j].resize(numHomePatches);
01122   for (int j=0;j < 2;j++)
01123     homePatchForceMsgs[j].resize(numHomePatches);
01124   // for (int j=0;j < 2;j++)
01125   //   numHomeAtoms[j].resize(numHomePatches);
01126   // If we have home patches, register this pencil with the neighbors and with self
01127   if (numHomePatches > 0) {
01128     for (int z=zlo;z <= zhi;z++) {
01129       for (int y=ylo;y <= yhi;y++) {
01130         int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01131         int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01132         int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01133         mgrProxy[node].registerNeighbor(yt, zt);
01134       }
01135     }
01136   }
01137 }

void ComputePmeCUDADevice::mergeForcesOnPatch ( int  homePatchIndex  ) 

Definition at line 1585 of file ComputePmeCUDAMgr.C.

References j, sendForcesToPatch(), x, CudaForce::y, and CudaForce::z.

Referenced by gatherForceDone(), and gatherForceDoneSubset().

01585                                                                 {
01586   // We have all the forces for this patch => merge on a single Pe
01587 
01588   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01589 
01590   // Message that goes out to the compute
01591   PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
01592 
01593   if (pmePencilType == 3) {
01594     // 3D box => simple memory copy will do
01595     // Location of forces in the force[] array
01596     int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01597     // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
01598     int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01599     int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01600     int atomEnd   = patchPos[pencilPatchIndex];
01601     int numAtoms = atomEnd-atomStart;
01602     if (forceMsg->zeroCopy) {
01603       // Zero-copy, just pass the pointer
01604       forceMsg->force = force+atomStart;
01605     } else {
01606       memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
01607     }
01608   } else {
01609 
01610     // Zero force array
01611     // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
01612     memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
01613 
01614     // Store forces from primary pencil
01615     {
01616       int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01617       int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
01618       int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
01619       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01620       int atomEnd   = patchPos[pencilPatchIndex];
01621       int numAtoms = atomEnd-atomStart;
01622 
01623       // Copy in local forces that are stored in the force[] array
01624       for (int i=0;i < numAtoms;i++) {
01625         forceMsg->force[index[atomStart + i]] = force[atomStart + i];
01626       }
01627 
01628     }
01629 
01630     // Add forces from neighboring pencils
01631     for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
01632       int pp               = plList[forceI][homePatchIndex][j].pp;
01633       int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
01634 
01635       int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
01636       int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
01637       int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
01638       int atomEnd   = patchPos[pencilPatchIndex];
01639       int numAtoms = atomEnd-atomStart;
01640       CudaForce *dstForce = forceMsg->force;
01641       // CudaForce *srcForce = neighborForcePencils[pp].force;
01642       CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
01643 
01644       for (int i=0;i < numAtoms;i++) {
01645         dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
01646         dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
01647         dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
01648       }
01649 
01650     }
01651   }
01652 
01653   // Clear storage
01654   plList[forceI][homePatchIndex].clear();
01655 
01656   // ----------------------------- lock start ---------------------------
01657   // bool done = false;
01658   CmiLock(lock_numHomePatchesMerged);
01659   numHomePatchesMerged++;
01660   if (numHomePatchesMerged == numHomePatches) {
01661     // Reset counter
01662     numHomePatchesMerged = 0;
01663 
01664     // Delete messages
01665     for (int i=0;i < neighborForcePencilMsgs.size();i++) {
01666       if (neighborForcePencilMsgs[i] != NULL) {
01667         delete neighborForcePencilMsgs[i];
01668         neighborForcePencilMsgs[i] = NULL;
01669       }
01670     }
01671 
01672     // Done merging and sending forces => clear storage
01673     for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
01674       homePatchIndexList[forceI][pp].clear();
01675     for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
01676       pmeAtomStorage[forceI][pp]->clear();
01677 
01678   }
01679   CmiUnlock(lock_numHomePatchesMerged);
01680   // ----------------------------- lock end  ---------------------------
01681 
01682   // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
01683   // which will then relay the message to the Pe that originally sent the atoms
01684   int pe = forceMsg->pe;
01685   if (CkNodeOf(pe) != CkMyNode())
01686     thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
01687   else
01688     sendForcesToPatch(forceMsg);
01689 
01690 }
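
The inner loops above are a gather-scatter add: each pencil stores a patch's atoms contiguously in filing order, and the pencil's index array maps every slice position back to the atom's original position within the home patch. A self-contained sketch of that step (Force is a stand-in for CudaForce; the function name is illustrative):

  struct Force { float x, y, z; };

  // srcForce[atomStart..atomEnd) is one patch's slice of a pencil's
  // forces in filing order; index[i] gives the atom's position in the
  // home patch, where the contribution is accumulated.
  static void scatterAdd(Force *dstForce, const Force *srcForce,
                         const int *index, int atomStart, int atomEnd) {
    for (int i = atomStart; i < atomEnd; i++) {
      dstForce[index[i]].x += srcForce[i].x;
      dstForce[index[i]].y += srcForce[i].y;
      dstForce[index[i]].z += srcForce[i].z;
    }
  }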

void ComputePmeCUDADevice::recvAtoms ( PmeAtomMsg msg  ) 

Definition at line 1148 of file ComputePmeCUDAMgr.C.

References PmeAtomMsg::atoms, PmeAtomMsg::compute, PmeForceMsg::compute, PmeAtomMsg::doEnergy, PmeAtomMsg::doVirial, PmeAtomFiler::fileAtoms(), PmeAtomFiler::getAtomIndex(), PmeAtomFiler::getNumAtoms(), j, NAMD_bug(), PmeForceMsg::numAtoms, PmeAtomMsg::numAtoms, PmeForceMsg::numStrayAtoms, Node::Object(), PmeForceMsg::pe, PmeAtomMsg::pe, PRIORITY_SIZE, sendAtomsToNeighbors(), Node::simParameters, simParams, y, PmeGrid::yBlocks, z, PmeGrid::zBlocks, and PmeForceMsg::zeroCopy.

01148                                                     {
01149 
01150   PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
01151   // Store "virial" and "energy" flags
01152   doVirial = msg->doVirial;
01153   doEnergy = msg->doEnergy;
01154   // Get lattice
01155   SimParameters *simParams = Node::Object()->simParameters;
01156   Lattice lattice = simParams->lattice;
01157 
01158   // Primary pencil index
01159   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01160   int p0 = 0;
01161   int pencilPatchIndex[9];
01162   int numStrayAtomsPatch = 0;
01163   if (pmePencilType == 3) {
01164     // 3D box => store atoms directly without index
01165     // NOTE: We don't check for stray atoms here!
01166     pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01167   } else {
01168 
01169     // File atoms
01170     pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
01171       pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
01172 
01173     // Loop through pencils and add atoms to pencil atom lists
01174     // NOTE: we only store to neighboring pencil if there are atoms to store
01175     int numAtomsCheck = 0;
01176     for (int p=0;p < 9;p++) {
01177 
01178       int y = (p % 3);
01179       int z = (p / 3);
01180 
01181       int pp = y + z*yNBlocks;
01182       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01183       if (pp == pp0) p0 = p;
01184       if (pp == pp0 || numAtoms > 0) {
01185         if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
01186           NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
01187         int* index = pmeAtomFilerPtr->getAtomIndex(p);
01188         pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
01189         // Number of patches in this storage tells you how many home patches contributed and
01190         // homePatchIndex (pe) tells you which patch contributed
01191         numAtomsCheck += numAtoms;
01192       }
01193     }
01194 
01195     // Deal with stray atoms
01196     numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
01197     if (numStrayAtomsPatch > 0) {
01198       int* index = pmeAtomFilerPtr->getAtomIndex(9);
01199       CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
01200       for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
01201         int j = index[i];
01202         CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
01203       }
01204     }
01205 
01206     if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
01207       NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
01208   }
01209 
01210   // Create storage for home patch forces
01211   PmeForceMsg *forceMsg;
01212   if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
01213     // 3D FFT and compute resides on the same node => use zero-copy forces
01214     forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
01215     forceMsg->zeroCopy = true;
01216   } else {
01217     forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
01218     forceMsg->zeroCopy = false;
01219   }
01220   forceMsg->numAtoms = msg->numAtoms;
01221   forceMsg->pe = msg->pe;
01222   forceMsg->compute = msg->compute;
01223   forceMsg->numStrayAtoms = numStrayAtomsPatch;
01224 
01225   bool done = false;
01226   // ----------------------------- lock start ---------------------------
01227   // Only after writing has finished do we get homePatchIndex.
01228   // This guarantees that for whichever thread receives "done=true", writing has finished on
01229   // ALL threads.
01230   CmiLock(lock_recvAtoms);
01231   numStrayAtoms += numStrayAtomsPatch;
01232   // Secure homePatchIndex. All writes after this must be inside lock-region
01233   int homePatchIndex = numHomePatchesRecv;
01234   // Store primary pencil first
01235   plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
01236   if (pmePencilType != 3) {
01237     // Go back through the neighboring pencils and store "homePatchIndex"
01238     for (int p=0;p < 9;p++) {
01239 
01240       int y = (p % 3);
01241       int z = (p / 3);
01242 
01243       int pp = y + z*yNBlocks;
01244       int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
01245       if (pp != pp0 && numAtoms > 0) {
01246         homePatchIndexList[atomI][pp].push_back(homePatchIndex);
01247         // plList[0...numHomePatches-1] stores, for each home patch, the locations of
01248         //                               the pencils that share it
01249         // plList[homePatchIndex].size() gives the number of pencils sharing that home patch
01250         plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
01251       }
01252     }
01253   }
01254   homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
01255   // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
01256   // Set the number of pencils contributing to this home patch
01257   numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
01258   //
01259   numHomePatchesRecv++;
01260   if (numHomePatchesRecv == numHomePatches) {
01261     // Reset counter
01262     numHomePatchesRecv = 0;
01263     done = true;
01264   }
01265   CmiUnlock(lock_recvAtoms);
01266   // ----------------------------- lock end  ---------------------------
01267 
01268   // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
01269 
01270   delete msg;
01271 
01272   if (done) {
01273     // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
01274     sendAtomsToNeighbors();
01275   }
01276 }
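
The filer distributes each patch's atoms over ten slots: slots 0 through 8 form a row-major 3x3 neighborhood around the primary pencil, and slot 9 collects stray atoms. A hedged standalone sketch of that layout, assuming the full 3x3 case (ylo = zlo = -1); smaller grids simply leave the out-of-range slots empty:

  #include <cstdio>

  int main() {
    const int yNBlocks = 3;
    for (int p = 0; p < 9; p++) {
      int y = p % 3;              // 0..2, i.e. the offset y - ylo
      int z = p / 3;              // 0..2, i.e. the offset z - zlo
      int pp = y + z * yNBlocks;  // flattened pencil index, as above
      printf("slot %d -> pencil offset (%+d,%+d), pp = %d%s\n",
             p, y - 1, z - 1, pp, (pp == 4) ? "  (primary)" : "");
    }
    return 0;
  }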

void ComputePmeCUDADevice::recvAtomsFromNeighbor ( PmeAtomPencilMsg msg  ) 

Definition at line 1322 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, NAMD_bug(), PmeAtomPencilMsg::numAtoms, registerRecvAtomsFromNeighbor(), PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

01322                                                                       {
01323   // Store into primary pencil
01324   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01325   // Compute pencil index relative to primary pencil
01326   int y = msg->srcY - pencilIndexY;
01327   if (y < ylo) y += pmeGrid.yBlocks;
01328   if (y > yhi) y -= pmeGrid.yBlocks;
01329   int z = msg->srcZ - pencilIndexZ;
01330   if (z < zlo) z += pmeGrid.zBlocks;
01331   if (z > zhi) z -= pmeGrid.zBlocks;
01332   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01333     NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
01334   }
01335   // Read energy and virial flags
01336   doEnergy = msg->doEnergy;
01337   doVirial = msg->doVirial;
01338   // Pencil index where atoms came from
01339   int pp = y-ylo + (z-zlo)*yNBlocks;
01340   // Store atoms and mark down the patch index where these atoms were added
01341   neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
01342 
01343   delete msg;
01344 
01345   registerRecvAtomsFromNeighbor();
01346 }
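
The offset computation wraps the source pencil's grid coordinates periodically into the local neighborhood window, since pencils on one edge of the grid neighbor pencils on the opposite edge. A minimal sketch of the wrap (the function name is illustrative):

  // Wrap the difference src - self periodically into [lo, hi].
  // Anything still outside the window afterwards came from a
  // non-neighbor, which the caller treats as a bug.
  static int wrapOffset(int src, int self, int nBlocks, int lo, int hi) {
    int d = src - self;
    if (d < lo) d += nBlocks;
    if (d > hi) d -= nBlocks;
    return d;
  }

For example, with pmeGrid.yBlocks = 4, pencilIndexY = 0 and msg->srcY = 3, the raw difference 3 wraps to -1: the neighbor just behind us across the periodic boundary.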

void ComputePmeCUDADevice::recvForcesFromNeighbor ( PmeForcePencilMsg msg  ) 

Definition at line 1530 of file ComputePmeCUDAMgr.C.

References NAMD_bug(), PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

01530                                                                         {
01531 
01532   // Source pencil index
01533   int y = msg->srcY - pencilIndexY;
01534   if (y < ylo) y += pmeGrid.yBlocks;
01535   if (y > yhi) y -= pmeGrid.yBlocks;
01536   int z = msg->srcZ - pencilIndexZ;
01537   if (z < zlo) z += pmeGrid.zBlocks;
01538   if (z > zhi) z -= pmeGrid.zBlocks;
01539 
01540   if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
01541     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
01542   }
01543 
01544   // Source pencil
01545   int pp = y-ylo + (z-zlo)*yNBlocks;
01546 
01547   // Store message (deleted in mergeForcesOnPatch)
01548   neighborForcePencilMsgs[pp] = msg;
01549 
01550   // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
01551   // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
01552   // neighborForcePencils[pp].numAtoms = msg->numAtoms;
01553   // neighborForcePencils[pp].y = msg->y;
01554   // neighborForcePencils[pp].z = msg->z;
01555   // neighborForcePencils[pp].srcY = msg->srcY;
01556   // neighborForcePencils[pp].srcZ = msg->srcZ;
01557   // delete msg;
01558 
01559   // numPatches = number of home patches this pencil has
01560   int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
01561   if (numPatches != homePatchIndexList[forceI][pp].size()) {
01562     NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
01563   }
01564   for (int i=0;i < numPatches;i++) {
01565     // this pencil contributed to home patch with index "homePatchIndex"
01566     int homePatchIndex = homePatchIndexList[forceI][pp][i];
01567     // ----------------------------- lock start ---------------------------
01568     // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
01569     //       we really only need to lock each element; that would require
01570     //       numHomePatches separate locks.
01571     bool done = false;
01572     CmiLock(lock_numPencils);
01573     numPencils[forceI][homePatchIndex]--;
01574     if (numPencils[forceI][homePatchIndex] == 0) done = true;
01575     CmiUnlock(lock_numPencils);
01576     // ----------------------------- lock end  ---------------------------
01577     if (done) {
01578       // This home patch is done, launch force merging
01579       thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
01580     }
01581   }
01582 
01583 }

void ComputePmeCUDADevice::registerNeighbor (  ) 

Definition at line 1139 of file ComputePmeCUDAMgr.C.

01139                                             {
01140   CmiLock(lock_numHomePatchesMerged);
01141   numNeighborsExpected++;
01142   CmiUnlock(lock_numHomePatchesMerged);
01143 }

void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor (  ) 

Definition at line 1348 of file ComputePmeCUDAMgr.C.

References spreadCharge().

Referenced by recvAtomsFromNeighbor(), and sendAtomsToNeighbors().

01348                                                          {
01349   // Primary pencil
01350   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01351 
01352   bool done = false;
01353   // ----------------------------- lock start ---------------------------
01354   CmiLock(lock_numNeighborsRecv);
01355   numNeighborsRecv++;
01356   if (numNeighborsRecv == numNeighborsExpected) {
01357     // Reset counter
01358     numNeighborsRecv = 0;
01359     done = true;
01360   }
01361   CmiUnlock(lock_numNeighborsRecv);
01362   // ----------------------------- lock end  ---------------------------
01363 
01364   if (done) {
01365     // Primary pencil has received all atoms and writing has finished => spread charge
01366     spreadCharge();
01367   }  
01368 }

void ComputePmeCUDADevice::sendAtomsToNeighbor ( int y, int z, int atomIval )

Definition at line 1296 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::numAtoms, PRIORITY_SIZE, PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeAtomPencilMsg::y, PmeGrid::yBlocks, PmeAtomPencilMsg::z, and PmeGrid::zBlocks.

Referenced by sendAtomsToNeighbors().

01296                                                                          {
01297   // Pencil index  
01298   int pp = y-ylo + (z-zlo)*yNBlocks;
01299   // This neighbor pencil is done, finish it up before accessing it
01300   pmeAtomStorage[atomIval][pp]->finish();
01301   // Compute destination neighbor pencil index (yt,zt)
01302   int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01303   int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01304   int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
01305   CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
01306   PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
01307   memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
01308   msgPencil->numAtoms = numAtoms;
01309   // Store destination pencil index
01310   msgPencil->y = yt;
01311   msgPencil->z = zt;
01312   // Store source pencil index
01313   msgPencil->srcY = pencilIndexY;
01314   msgPencil->srcZ = pencilIndexZ;
01315   // Store energy and virial flags
01316   msgPencil->doEnergy = doEnergy;
01317   msgPencil->doVirial = doVirial;
01318   int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
01319   mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
01320 }

void ComputePmeCUDADevice::sendAtomsToNeighbors (  ) 

Definition at line 1281 of file ComputePmeCUDAMgr.C.

References registerRecvAtomsFromNeighbor(), sendAtomsToNeighbor(), y, and z.

Referenced by recvAtoms().

01281                                                 {
01282   for (int z=zlo;z <= zhi;z++) {
01283     for (int y=ylo;y <= yhi;y++) {
01284       // Only send to neighbors, not self
01285       if (y != 0 || z != 0) {
01286         // NOTE: Must send the atomI value since it will change in spreadCharge(), which
01287         //       might occur before these sends have been performed
01288         thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
01289       }
01290     }
01291   }
01292   // Register primary pencil
01293   registerRecvAtomsFromNeighbor();
01294 }

void ComputePmeCUDADevice::sendForcesToNeighbors (  ) 

Definition at line 1497 of file ComputePmeCUDAMgr.C.

References PmeForcePencilMsg::force, PmeForcePencilMsg::numAtoms, PRIORITY_SIZE, PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeForcePencilMsg::y, y, PmeGrid::yBlocks, PmeForcePencilMsg::z, z, and PmeGrid::zBlocks.

Referenced by gatherForceDone().

01497                                                  {
01498   // Primary pencil has the forces
01499   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01500   int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
01501   // Loop through neighboring pencils
01502   for (int z=zlo;z <= zhi;z++) {
01503     for (int y=ylo;y <= yhi;y++) {
01504       // Only send to neighbors, not self
01505       if (y != 0 || z != 0) {
01506         int pp = y-ylo + (z-zlo)*yNBlocks;
01507         int patchIndex = neighborPatchIndex[pp];
01508         int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
01509         int atomEnd   = patchPos[patchIndex];
01510         int natom = atomEnd-atomStart;
01511         // copy forces
01512         PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
01513         msg->numAtoms = natom;
01514         memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
01515         // Calculate destination pencil index (dstY, dstZ) for this neighbor
01516         int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
01517         int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
01518         int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
01519         msg->y = dstY;
01520         msg->z = dstZ;
01521         // Store source pencil index
01522         msg->srcY = pencilIndexY;
01523         msg->srcZ = pencilIndexZ;
01524         mgrProxy[node].recvForcesFromNeighbor(msg);
01525       }
01526     }
01527   }
01528 }
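
The patchPos[] array used here (and in mergeForcesOnPatch() above) behaves as an inclusive prefix sum of per-patch atom counts, so each patch owns a contiguous slice of the pencil's atom and force arrays. A one-function sketch of the slice computation (the helper name is illustrative):

  // patchPos[k] = total atoms in patches 0..k, so patch k's slice of
  // the contiguous per-pencil array is [patchPos[k-1], patchPos[k]),
  // with patch 0 starting at zero.
  static void patchSlice(const int *patchPos, int patchIndex,
                         int &atomStart, int &atomEnd) {
    atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex - 1];
    atomEnd   = patchPos[patchIndex];
  }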

void ComputePmeCUDADevice::sendForcesToPatch ( PmeForceMsg forceMsg  ) 

Definition at line 1692 of file ComputePmeCUDAMgr.C.

References PmeForceMsg::compute, Compute::localWorkMsg, PmeForceMsg::pe, and ComputePmeCUDA::storePmeForceMsg().

Referenced by mergeForcesOnPatch().

01692                                                                   {
01693   // Now we're on the node that has the Pe, hence the "compute" pointer is valid
01694   int pe                  = forceMsg->pe;
01695   ComputePmeCUDA *compute = forceMsg->compute;
01696 
01697   // Store message for use in ComputePmeCUDA, where it'll also be deleted.
01698   if (compute->storePmeForceMsg(forceMsg)) {
01699     // Enqueue on the pe that sent the atoms in the first place
01700     LocalWorkMsg *lmsg = compute->localWorkMsg;
01701     CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
01702     wdProxy[pe].enqueuePme(lmsg);
01703   }
01704 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilX  pmePencilX_in  ) 

Definition at line 1091 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01091                                                                              {
01092   if (pmePencilType != 1)
01093     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
01094   pmePencilX = pmePencilX_in;
01095 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXY  pmePencilXY_in  ) 

Definition at line 1085 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01085                                                                                {
01086   if (pmePencilType != 2)
01087     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
01088   pmePencilXY = pmePencilXY_in;
01089 }

void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXYZ  pmePencilXYZ_in  ) 

Definition at line 1079 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

01079                                                                                  {
01080   if (pmePencilType != 3)
01081     NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
01082   pmePencilXYZ = pmePencilXYZ_in;
01083 }

void ComputePmeCUDADevice::spreadCharge (  ) 

Definition at line 1370 of file ComputePmeCUDAMgr.C.

References atoms, PmeRealSpaceCompute::copyAtoms(), PmeRunMsg::doEnergy, PmeRunMsg::doVirial, PmeRunMsg::lattice, PmeRunMsg::numStrayAtoms, Node::Object(), Node::simParameters, simParams, PmeRealSpaceCompute::spreadCharge(), and std::swap().

Referenced by registerRecvAtomsFromNeighbor().

01370                                         {
01371   // Spread charges in primary pencil
01372   int pp0 = 0-ylo + (0-zlo)*yNBlocks;
01373   // Primary pencil is done, finish it up before accessing it
01374   // (clearing is done in mergeForcesOnPatch)
01375   pmeAtomStorage[atomI][pp0]->finish();
01376   // Get the number of atoms and pointer to atoms
01377   int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
01378   CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
01379   // Flip atomI <-> forceI
01380   std::swap(atomI, forceI);
01381   // Re-allocate force buffer if needed
01382   reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
01383   // Setup patches and atoms
01384   SimParameters *simParams = Node::Object()->simParameters;
01385   Lattice lattice = simParams->lattice;
01386   pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
01387   // Spread charge
01388   beforeWalltime = CmiWallTimer();
01389   pmeRealSpaceCompute->spreadCharge(lattice);
01390   // Send "charge grid ready" message to the PME solver
01391   PmeRunMsg *pmeRunMsg = new PmeRunMsg();
01392   pmeRunMsg->doVirial = doVirial;
01393   pmeRunMsg->doEnergy = doEnergy;
01394   pmeRunMsg->lattice = lattice;
01395   pmeRunMsg->numStrayAtoms = numStrayAtoms;
01396   // Reset stray atom counter
01397   numStrayAtoms = 0;
01398   switch(pmePencilType) {
01399     case 1:
01400     pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
01401     break;
01402     case 2:
01403     pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
01404     break;
01405     case 3:
01406     pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
01407     break;
01408   }
01409 }
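
The std::swap of atomI and forceI implements the class's double buffering: the slot just filled with atoms becomes the force side, since its patch bookkeeping is still needed when forces return from the solver, while the other slot is freed to receive the next step's atoms (compare the NOTE in sendAtomsToNeighbors() above). A minimal sketch of the idea (the struct is illustrative, not NAMD code):

  #include <utility>

  // Two-slot double buffer: recvAtoms* writes into slot atomI while the
  // force path still reads the bookkeeping kept under slot forceI.
  // spreadCharge() flips the indices once the atom side is finished.
  struct DoubleBuffer {
    int atomI = 0;
    int forceI = 1;
    void flip() { std::swap(atomI, forceI); }
  };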


The documentation for this class was generated from the following files:
ComputePmeCUDAMgr.h
ComputePmeCUDAMgr.C