NAMD
ComputePmeCUDADevice Class Reference

#include <ComputePmeCUDAMgr.h>

Inheritance diagram for ComputePmeCUDADevice:

Public Member Functions

 ComputePmeCUDADevice ()
 
 ComputePmeCUDADevice (CkMigrateMessage *m)
 
 ~ComputePmeCUDADevice ()
 
void initialize (PmeGrid &pmeGrid_in, int pencilIndexY_in, int pencilIndexZ_in, int deviceID_in, int pmePencilType_in, CProxy_ComputePmeCUDAMgr mgrProxy_in, CProxy_PmeAtomFiler pmeAtomFiler_in)
 
int getDeviceID ()
 
cudaStream_t getStream ()
 
CProxy_ComputePmeCUDAMgr getMgrProxy ()
 
void setPencilProxy (CProxy_CudaPmePencilXYZ pmePencilXYZ_in)
 
void setPencilProxy (CProxy_CudaPmePencilXY pmePencilXY_in)
 
void setPencilProxy (CProxy_CudaPmePencilX pmePencilX_in)
 
void activate_pencils ()
 
void initializePatches (int numHomePatches_in)
 
void registerNeighbor ()
 
void recvAtoms (PmeAtomMsg *msg)
 
void sendAtomsToNeighbors ()
 
void sendAtomsToNeighbor (int y, int z, int atomIval)
 
void recvAtomsFromNeighbor (PmeAtomPencilMsg *msg)
 
void registerRecvAtomsFromNeighbor ()
 
void spreadCharge ()
 
void gatherForce ()
 
void gatherForceDone ()
 
void sendForcesToNeighbors ()
 
void recvForcesFromNeighbor (PmeForcePencilMsg *msg)
 
void mergeForcesOnPatch (int homePatchIndex)
 
void sendForcesToPatch (PmeForceMsg *forceMsg)
 
void gatherForceDoneSubset (int first, int last)
 

Detailed Description

Definition at line 297 of file ComputePmeCUDAMgr.h.

Constructor & Destructor Documentation

ComputePmeCUDADevice::ComputePmeCUDADevice ( )

Definition at line 938 of file ComputePmeCUDAMgr.C.

938  {
939  // __sdag_init();
940  numHomePatches = 0;
941  forceCapacity = 0;
942  force = NULL;
943  pmeRealSpaceCompute = NULL;
944  streamCreated = false;
945  lock_numHomePatchesMerged = CmiCreateLock();
946  lock_numPencils = CmiCreateLock();
947  lock_numNeighborsRecv = CmiCreateLock();
948  lock_recvAtoms = CmiCreateLock();
949  numNeighborsExpected = 0;
950  numStrayAtoms = 0;
951  // Reset counters
952  numNeighborsRecv = 0;
953  numHomePatchesRecv = 0;
954  numHomePatchesMerged = 0;
955  atomI = 0;
956  forceI = 1;
957 }
ComputePmeCUDADevice::ComputePmeCUDADevice ( CkMigrateMessage *  m)

Definition at line 959 of file ComputePmeCUDAMgr.C.

959  {
960  // __sdag_init();
961  numHomePatches = 0;
962  forceCapacity = 0;
963  force = NULL;
964  pmeRealSpaceCompute = NULL;
965  streamCreated = false;
966  lock_numHomePatchesMerged = CmiCreateLock();
967  lock_numPencils = CmiCreateLock();
968  lock_numNeighborsRecv = CmiCreateLock();
969  lock_recvAtoms = CmiCreateLock();
970  numNeighborsExpected = 0;
971  numStrayAtoms = 0;
972  // Reset counters
973  numNeighborsRecv = 0;
974  numHomePatchesRecv = 0;
975  numHomePatchesMerged = 0;
976  atomI = 0;
977  forceI = 1;
978 }
ComputePmeCUDADevice::~ComputePmeCUDADevice ( )

Definition at line 980 of file ComputePmeCUDAMgr.C.

References cudaCheck.

980  {
981  if (streamCreated) {
982  cudaCheck(cudaSetDevice(deviceID));
983  cudaCheck(cudaStreamDestroy(stream));
984  }
985  for (int j=0;j < 2;j++)
986  for (int i=0;i < pmeAtomStorage[j].size();i++) {
987  if (pmeAtomStorageAllocatedHere[i]) delete pmeAtomStorage[j][i];
988  }
989  if (force != NULL) deallocate_host<CudaForce>(&force);
990  if (pmeRealSpaceCompute != NULL) delete pmeRealSpaceCompute;
991  CmiDestroyLock(lock_numHomePatchesMerged);
992  CmiDestroyLock(lock_numPencils);
993  CmiDestroyLock(lock_numNeighborsRecv);
994  CmiDestroyLock(lock_recvAtoms);
995 }

Member Function Documentation

void ComputePmeCUDADevice::activate_pencils ( )

Definition at line 1097 of file ComputePmeCUDAMgr.C.

References PmeStartMsg::data, PmeStartMsg::dataSize, PmeRealSpaceCompute::getData(), and PmeRealSpaceCompute::getDataSize().

1097  {
1098  if (pmePencilType == 1) {
1099  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1100  pmeStartXMsg->data = pmeRealSpaceCompute->getData();
1101  pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1102  pmePencilX(0, pencilIndexY, pencilIndexZ).start(pmeStartXMsg);
1103  } else if (pmePencilType == 2) {
1104  PmeStartMsg* pmeStartXMsg = new PmeStartMsg();
1105  pmeStartXMsg->data = pmeRealSpaceCompute->getData();
1106  pmeStartXMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1107  pmePencilXY(0, 0, pencilIndexZ).start(pmeStartXMsg);
1108  } else if (pmePencilType == 3) {
1109  PmeStartMsg* pmeStartMsg = new PmeStartMsg();
1110  pmeStartMsg->data = pmeRealSpaceCompute->getData();
1111  pmeStartMsg->dataSize = pmeRealSpaceCompute->getDataSize();
1112  pmePencilXYZ[0].start(pmeStartMsg);
1113  }
1114 }
void ComputePmeCUDADevice::gatherForce ( )

Definition at line 1415 of file ComputePmeCUDAMgr.C.

References CUDA_PME_SPREADCHARGE_EVENT, and PmeRealSpaceCompute::gatherForce().

1415  {
1416  traceUserBracketEvent(CUDA_PME_SPREADCHARGE_EVENT, beforeWalltime, CmiWallTimer());
1417  beforeWalltime = CmiWallTimer();
1418  // (already have the updated lattice)
1419  pmeRealSpaceCompute->gatherForce(lattice, force);
1420  // Set callback that will call gatherForceDone() once gatherForce is done
1421  ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->gatherForceSetCallback(this);
1422  // ((CudaPmeRealSpaceCompute*)pmeRealSpaceCompute)->waitGatherForceDone();
1423  // gatherForceDone();
1424 }
void ComputePmeCUDADevice::gatherForceDone ( )

Definition at line 1450 of file ComputePmeCUDAMgr.C.

References CUDA_PME_GATHERFORCE_EVENT, gatherForceDoneLoop(), Node::Object(), sendForcesToNeighbors(), Node::simParameters, and SimParameters::useCkLoop.

1450  {
1451  // Primary pencil has the forces
1452 
1453  traceUserBracketEvent(CUDA_PME_GATHERFORCE_EVENT, beforeWalltime, CmiWallTimer());
1454 
1455  // Send forces to neighbors
1456  sendForcesToNeighbors();
1457 
1458 #if CMK_SMP && USE_CKLOOP
1459  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1460  if (useCkLoop >= 1) {
1461  CkLoop_Parallelize(gatherForceDoneLoop, 1, (void *)this, CkMyNodeSize(), 0, numHomePatches-1);
1462  } else
1463 #endif
1464 
1465  {
1466  // Loop through home patches and mark the primary pencil as "done"
1467  for (int homePatchIndex=0;homePatchIndex < numHomePatches;homePatchIndex++) {
1468  bool done = false;
1469  // ----------------------------- lock start ---------------------------
1470  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
1471  // we really only need to lock each element. That would, however, require
1472  // numHomePatches locks.
1473  if (pmePencilType != 3) CmiLock(lock_numPencils);
1474  numPencils[forceI][homePatchIndex]--;
1475  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1476  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1477  // ----------------------------- lock end ---------------------------
1478  if (done) {
1479  // This home patch is done, launch force merging
1480  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1481  }
1482  }
1483  }
1484 
1485  // In case we have no home patches, clear the primary pencil storage here
1486  if (numHomePatches == 0) {
1487  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1488  pmeAtomStorage[forceI][pp0]->clear();
1489  }
1490 
1491 }
void ComputePmeCUDADevice::gatherForceDoneSubset ( int  first,
int  last 
)

Definition at line 1431 of file ComputePmeCUDAMgr.C.

References mergeForcesOnPatch().

Referenced by gatherForceDoneLoop().

1431  {
1432  for (int homePatchIndex=first;homePatchIndex <= last;homePatchIndex++) {
1433  bool done = false;
1434  // ----------------------------- lock start ---------------------------
1435  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
1436  // we really only need to lock each element. That would, however, require
1437  // numHomePatches locks.
1438  if (pmePencilType != 3) CmiLock(lock_numPencils);
1439  numPencils[forceI][homePatchIndex]--;
1440  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1441  if (pmePencilType != 3) CmiUnlock(lock_numPencils);
1442  // ----------------------------- lock end ---------------------------
1443  if (done) {
1444  // This home patch is done, launch force merging
1445  mergeForcesOnPatch(homePatchIndex);
1446  }
1447  }
1448 }
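gatherForceDone() and gatherForceDoneSubset() share the same pattern: a node-level lock protects a per-home-patch countdown of outstanding pencils, and whichever thread drives a counter to zero launches force merging for that patch. The following is a simplified, self-contained sketch of that pattern, with std::mutex standing in for CmiLock and a placeholder merge routine; the names and counts are illustrative only, not taken from the NAMD source.

  #include <cstdio>
  #include <mutex>
  #include <vector>

  std::mutex lock_numPencils;            // stands in for CmiCreateLock()/CmiLock()
  std::vector<int> numPencils;           // pencils still outstanding per home patch

  void mergeForcesOnPatch(int homePatchIndex) {   // placeholder for the real merge
    printf("patch %d done, merging forces\n", homePatchIndex);
  }

  // Called once per contributing pencil for every home patch in [first, last]
  void markPencilDone(int first, int last) {
    for (int homePatchIndex = first; homePatchIndex <= last; homePatchIndex++) {
      bool done = false;
      {
        std::lock_guard<std::mutex> guard(lock_numPencils);
        if (--numPencils[homePatchIndex] == 0) done = true;
      }
      if (done) mergeForcesOnPatch(homePatchIndex);
    }
  }

  int main() {
    numPencils.assign(3, 2);   // 3 home patches, each shared by 2 pencils
    markPencilDone(0, 2);      // first pencil reports in: nothing merges yet
    markPencilDone(0, 2);      // second pencil reports in: all patches merge
    return 0;
  }

The single shared lock is a deliberate trade-off noted in the source comments: one lock for the whole numPencils[] array instead of one lock per home patch.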
int ComputePmeCUDADevice::getDeviceID ( )

Definition at line 1071 of file ComputePmeCUDAMgr.C.

1071  {
1072  return deviceID;
1073 }
CProxy_ComputePmeCUDAMgr ComputePmeCUDADevice::getMgrProxy ( )

Definition at line 1075 of file ComputePmeCUDAMgr.C.

1075  {
1076  return mgrProxy;
1077 }
cudaStream_t ComputePmeCUDADevice::getStream ( )

Definition at line 1067 of file ComputePmeCUDAMgr.C.

1067  {
1068  return stream;
1069 }
void ComputePmeCUDADevice::initialize ( PmeGrid pmeGrid_in,
int  pencilIndexY_in,
int  pencilIndexZ_in,
int  deviceID_in,
int  pmePencilType_in,
CProxy_ComputePmeCUDAMgr  mgrProxy_in,
CProxy_PmeAtomFiler  pmeAtomFiler_in 
)

Definition at line 997 of file ComputePmeCUDAMgr.C.

References createStream(), cudaCheck, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

999  {
1000 
1001  deviceID = deviceID_in;
1002  cudaCheck(cudaSetDevice(deviceID));
1003  pmePencilType = pmePencilType_in;
1004  pmeGrid = pmeGrid_in;
1005  pencilIndexY = pencilIndexY_in;
1006  pencilIndexZ = pencilIndexZ_in;
1007  mgrProxy = mgrProxy_in;
1008  pmeAtomFiler = pmeAtomFiler_in;
1009  // Size of the neighboring pencil grid, max 3x3
1010  yNBlocks = std::min(pmeGrid.yBlocks, 3);
1011  zNBlocks = std::min(pmeGrid.zBlocks, 3);
1012  // Local pencil is at y=0,z=0
1013  if (yNBlocks == 1) {
1014  ylo = 0;
1015  yhi = 0;
1016  } else if (yNBlocks == 2) {
1017  ylo = -1;
1018  yhi = 0;
1019  } else {
1020  ylo = -1;
1021  yhi = 1;
1022  }
1023  if (zNBlocks == 1) {
1024  zlo = 0;
1025  zhi = 0;
1026  } else if (zNBlocks == 2) {
1027  zlo = -1;
1028  zhi = 0;
1029  } else {
1030  zlo = -1;
1031  zhi = 1;
1032  }
1033 
1034  neighborForcePencilMsgs.resize(yNBlocks*zNBlocks, NULL);
1035  // neighborForcePencils.resize(yNBlocks*zNBlocks);
1036  for (int j=0;j < 2;j++)
1037  homePatchIndexList[j].resize(yNBlocks*zNBlocks);
1038  neighborPatchIndex.resize(yNBlocks*zNBlocks);
1039 
1040  pmeAtomStorageAllocatedHere.resize(yNBlocks*zNBlocks, false);
1041  for (int j=0;j < 2;j++) {
1042  pmeAtomStorage[j].resize(yNBlocks*zNBlocks, NULL);
1043  for (int z=zlo;z <= zhi;z++) {
1044  for (int y=ylo;y <= yhi;y++) {
1045  int pp = y-ylo + (z-zlo)*yNBlocks;
1046  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1047  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1048  if (y == 0 && z == 0) {
1049  // Primary pencil
1050  pmeAtomStorage[j][pp] = new CudaPmeAtomStorage(pmePencilType != 3);
1051  } else {
1052  pmeAtomStorage[j][pp] = new CpuPmeAtomStorage(pmePencilType != 3);
1053  }
1054  pmeAtomStorageAllocatedHere[pp] = true;
1055  }
1056  }
1057  }
1058 
1059  // Create stream for this device
1060  createStream(stream);
1061  streamCreated = true;
1062  pmeRealSpaceCompute = new CudaPmeRealSpaceCompute(pmeGrid, pencilIndexY, pencilIndexZ,
1063  deviceID, stream);
1064 
1065 }
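The neighbor bookkeeping set up here maps the (at most 3x3) block of pencils surrounding the primary pencil to a flat index pp = (y-ylo) + (z-zlo)*yNBlocks, and wraps absolute pencil coordinates periodically across the grid. Below is a minimal standalone C++ sketch of that arithmetic; the grid sizes and pencil indices are hypothetical, not taken from a real simulation.

  #include <algorithm>
  #include <cstdio>

  int main() {
    // Hypothetical grid decomposition: 4 x 5 pencils in (y, z)
    int yBlocks = 4, zBlocks = 5;
    int pencilIndexY = 0, pencilIndexZ = 4;   // this device's primary pencil

    // Size of the neighboring pencil grid, at most 3 x 3 (as in initialize())
    int yNBlocks = std::min(yBlocks, 3);
    int zNBlocks = std::min(zBlocks, 3);
    int ylo = (yNBlocks == 1) ? 0 : -1;
    int yhi = (yNBlocks <= 2) ? 0 : 1;
    int zlo = (zNBlocks == 1) ? 0 : -1;
    int zhi = (zNBlocks <= 2) ? 0 : 1;

    for (int z = zlo; z <= zhi; z++) {
      for (int y = ylo; y <= yhi; y++) {
        // Flat index into the per-device neighbor arrays
        int pp = (y - ylo) + (z - zlo) * yNBlocks;
        // Absolute pencil coordinates with periodic wrap-around
        int yt = (pencilIndexY + y + yBlocks) % yBlocks;
        int zt = (pencilIndexZ + z + zBlocks) % zBlocks;
        printf("offset (%2d,%2d) -> pp=%d, absolute pencil (%d,%d)\n", y, z, pp, yt, zt);
      }
    }
    return 0;
  }

The same flat index pp is used throughout the class to address pmeAtomStorage, neighborPatchIndex, and neighborForcePencilMsgs.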
void ComputePmeCUDADevice::initializePatches ( int  numHomePatches_in)

Definition at line 1116 of file ComputePmeCUDAMgr.C.

References y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

1116  {
1117  numHomePatches = numHomePatches_in;
1118  for (int j=0;j < 2;j++)
1119  numPencils[j].resize(numHomePatches);
1120  for (int j=0;j < 2;j++)
1121  plList[j].resize(numHomePatches);
1122  for (int j=0;j < 2;j++)
1123  homePatchForceMsgs[j].resize(numHomePatches);
1124  // for (int j=0;j < 2;j++)
1125  // numHomeAtoms[j].resize(numHomePatches);
1126  // If we have home patches, register this pencil with the neighbors and with self
1127  if (numHomePatches > 0) {
1128  for (int z=zlo;z <= zhi;z++) {
1129  for (int y=ylo;y <= yhi;y++) {
1130  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1131  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1132  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1133  mgrProxy[node].registerNeighbor(yt, zt);
1134  }
1135  }
1136  }
1137 }
void ComputePmeCUDADevice::mergeForcesOnPatch ( int  homePatchIndex)

Definition at line 1584 of file ComputePmeCUDAMgr.C.

References sendForcesToPatch(), CudaForce::x, CudaForce::y, and CudaForce::z.

Referenced by gatherForceDoneSubset().

1584  {
1585  // We have all the forces for this patch => merge on a single Pe
1586 
1587  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1588 
1589  // Message that goes out to the compute
1590  PmeForceMsg *forceMsg = homePatchForceMsgs[forceI][homePatchIndex];
1591 
1592  if (pmePencilType == 3) {
1593  // 3D box => simple memory copy will do
1594  // Location of forces in the force[] array
1595  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1596  // plList[homePatchIndex] array tells you the location of pencils that are sharing this home patch
1597  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1598  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1599  int atomEnd = patchPos[pencilPatchIndex];
1600  int numAtoms = atomEnd-atomStart;
1601  if (forceMsg->zeroCopy) {
1602  // Zero-copy, just pass the pointer
1603  forceMsg->force = force+atomStart;
1604  } else {
1605  memcpy(forceMsg->force, force+atomStart, numAtoms*sizeof(CudaForce));
1606  }
1607  } else {
1608 
1609  // Zero force array
1610  // memset(forceMsg->force, 0, numHomeAtoms[forceI][homePatchIndex]*sizeof(CudaForce));
1611  memset(forceMsg->force, 0, forceMsg->numAtoms*sizeof(CudaForce));
1612 
1613  // Store forces from primary pencil
1614  {
1615  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1616  int* index = pmeAtomStorage[forceI][pp0]->getAtomIndex();
1617  int pencilPatchIndex = plList[forceI][homePatchIndex][0].pencilPatchIndex;
1618  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1619  int atomEnd = patchPos[pencilPatchIndex];
1620  int numAtoms = atomEnd-atomStart;
1621 
1622  // Copy in local forces that are stored in the force[] array
1623  for (int i=0;i < numAtoms;i++) {
1624  forceMsg->force[index[atomStart + i]] = force[atomStart + i];
1625  }
1626 
1627  }
1628 
1629  // Add forces from neighboring pencils
1630  for (int j=1;j < plList[forceI][homePatchIndex].size();j++) {
1631  int pp = plList[forceI][homePatchIndex][j].pp;
1632  int pencilPatchIndex = plList[forceI][homePatchIndex][j].pencilPatchIndex;
1633 
1634  int* patchPos = pmeAtomStorage[forceI][pp]->getPatchPos();
1635  int* index = pmeAtomStorage[forceI][pp]->getAtomIndex();
1636  int atomStart = (pencilPatchIndex == 0) ? 0 : patchPos[pencilPatchIndex-1];
1637  int atomEnd = patchPos[pencilPatchIndex];
1638  int numAtoms = atomEnd-atomStart;
1639  CudaForce *dstForce = forceMsg->force;
1640  // CudaForce *srcForce = neighborForcePencils[pp].force;
1641  CudaForce *srcForce = neighborForcePencilMsgs[pp]->force;
1642 
1643  for (int i=0;i < numAtoms;i++) {
1644  dstForce[index[atomStart + i]].x += srcForce[atomStart + i].x;
1645  dstForce[index[atomStart + i]].y += srcForce[atomStart + i].y;
1646  dstForce[index[atomStart + i]].z += srcForce[atomStart + i].z;
1647  }
1648 
1649  }
1650  }
1651 
1652  // Clear storage
1653  plList[forceI][homePatchIndex].clear();
1654 
1655  // ----------------------------- lock start ---------------------------
1656  // bool done = false;
1657  CmiLock(lock_numHomePatchesMerged);
1658  numHomePatchesMerged++;
1659  if (numHomePatchesMerged == numHomePatches) {
1660  // Reset counter
1661  numHomePatchesMerged = 0;
1662 
1663  // Delete messages
1664  for (int i=0;i < neighborForcePencilMsgs.size();i++) {
1665  if (neighborForcePencilMsgs[i] != NULL) {
1666  delete neighborForcePencilMsgs[i];
1667  neighborForcePencilMsgs[i] = NULL;
1668  }
1669  }
1670 
1671  // Done merging and sending forces => clear storage
1672  for (int pp=0;pp < homePatchIndexList[forceI].size();pp++)
1673  homePatchIndexList[forceI][pp].clear();
1674  for (int pp=0;pp < pmeAtomStorage[forceI].size();pp++)
1675  pmeAtomStorage[forceI][pp]->clear();
1676 
1677  }
1678  CmiUnlock(lock_numHomePatchesMerged);
1679  // ----------------------------- lock end ---------------------------
1680 
1681  // Patch is done => send over to the node that contains the ComputePmeCUDA compute,
 1682  // this node will then relay the message to the Pe that originally sent the atoms
1683  int pe = forceMsg->pe;
1684  if (CkNodeOf(pe) != CkMyNode())
1685  thisProxy[CkNodeOf(pe)].sendForcesToPatch(forceMsg);
1686  else
1687  sendForcesToPatch(forceMsg);
1688 
1689 }
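The neighbor-pencil branch above scatters pencil-ordered forces back into home-patch order through the per-pencil atom index array, accumulating contributions atom by atom. Here is a small self-contained sketch of that scatter-add, using a simplified CudaForce struct and made-up data (the real code assigns for the primary pencil and accumulates for neighbors).

  #include <cstdio>
  #include <cstring>
  #include <vector>

  struct CudaForce { float x, y, z; };   // simplified stand-in for the real type

  int main() {
    // Home patch with 4 atoms; forces start zeroed as in the non-XYZ branch
    std::vector<CudaForce> patchForce(4);
    std::memset(patchForce.data(), 0, patchForce.size()*sizeof(CudaForce));

    // A pencil holds this patch's atoms at slots [atomStart, atomEnd);
    // index[] maps each slot back to the atom's position in the patch.
    CudaForce pencilForce[] = { {1,0,0}, {0,2,0}, {0,0,3}, {4,0,0} };
    int index[]             = { 2, 0, 3, 1 };
    int atomStart = 0, atomEnd = 4;

    for (int i = 0; i < atomEnd - atomStart; i++) {
      patchForce[index[atomStart + i]].x += pencilForce[atomStart + i].x;
      patchForce[index[atomStart + i]].y += pencilForce[atomStart + i].y;
      patchForce[index[atomStart + i]].z += pencilForce[atomStart + i].z;
    }

    for (size_t i = 0; i < patchForce.size(); i++)
      printf("atom %zu: (%g, %g, %g)\n", i, patchForce[i].x, patchForce[i].y, patchForce[i].z);
    return 0;
  }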
void ComputePmeCUDADevice::recvAtoms ( PmeAtomMsg msg)

Definition at line 1148 of file ComputePmeCUDAMgr.C.

References PmeAtomMsg::atoms, PmeAtomMsg::compute, PmeForceMsg::compute, PmeAtomMsg::doEnergy, PmeAtomMsg::doVirial, PmeAtomFiler::fileAtoms(), PmeAtomFiler::getAtomIndex(), PmeAtomFiler::getNumAtoms(), PmeAtomMsg::lattice, NAMD_bug(), PmeAtomMsg::numAtoms, PmeForceMsg::numAtoms, PmeForceMsg::numStrayAtoms, PmeAtomMsg::pe, PmeForceMsg::pe, PRIORITY_SIZE, sendAtomsToNeighbors(), CudaAtom::x, CudaAtom::y, y, PmeGrid::yBlocks, CudaAtom::z, z, PmeGrid::zBlocks, and PmeForceMsg::zeroCopy.

1148  {
1149 
1150  PmeAtomFiler *pmeAtomFilerPtr = pmeAtomFiler[CkMyPe()].ckLocalBranch();
1151  // Store "virial" and "energy" flags
1152  doVirial = msg->doVirial;
1153  doEnergy = msg->doEnergy;
1154  // Store lattice
1155  lattice = msg->lattice;
1156 
1157  // Primary pencil index
1158  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1159  int p0 = 0;
1160  int pencilPatchIndex[9];
1161  int numStrayAtomsPatch = 0;
1162  if (pmePencilType == 3) {
1163  // 3D box => store atoms directly without index
1164  // NOTE: We don't check for stray atoms here!
1165  pencilPatchIndex[p0] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
1166  } else {
1167 
1168  // File atoms
1169  pmeAtomFilerPtr->fileAtoms(msg->numAtoms, msg->atoms, lattice, pmeGrid,
1170  pencilIndexY, pencilIndexZ, ylo, yhi, zlo, zhi);
1171 
1172  // Loop through pencils and add atoms to pencil atom lists
1173  // NOTE: we only store to neighboring pencil if there are atoms to store
1174  int numAtomsCheck = 0;
1175  for (int p=0;p < 9;p++) {
1176 
1177  int y = (p % 3);
1178  int z = (p / 3);
1179 
1180  int pp = y + z*yNBlocks;
1181  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1182  if (pp == pp0) p0 = p;
1183  if (pp == pp0 || numAtoms > 0) {
1184  if (pmeGrid.yBlocks == 1 && pmeGrid.zBlocks == 1 && (y != 0 || z != 0))
1185  NAMD_bug("ComputePmeCUDADevice::recvAtoms, problem with atom filing");
1186  int* index = pmeAtomFilerPtr->getAtomIndex(p);
1187  pencilPatchIndex[p] = pmeAtomStorage[atomI][pp]->addAtomsWithIndex(numAtoms, msg->atoms, index);
1188  // Number of patches in this storage tells you how many home patches contributed and
1189  // homePatchIndex (pe) tells you which patch contributed
1190  numAtomsCheck += numAtoms;
1191  }
1192  }
1193 
1194  // Deal with stray atoms
1195  numStrayAtomsPatch = pmeAtomFilerPtr->getNumAtoms(9);
1196  if (numStrayAtomsPatch > 0) {
1197  int* index = pmeAtomFilerPtr->getAtomIndex(9);
1198  CkPrintf("%d stray charges detected. Up to 10 listed below (index in patch, x, y, z):\n", numStrayAtomsPatch);
1199  for (int i=0;i < std::min(numStrayAtomsPatch, 10);i++) {
1200  int j = index[i];
1201  CkPrintf("%d %f %f %f\n", j, msg->atoms[j].x, msg->atoms[j].y, msg->atoms[j].z);
1202  }
1203  }
1204 
1205  if (numAtomsCheck + numStrayAtomsPatch < msg->numAtoms)
1206  NAMD_bug("ComputePmeCUDADevice::recvAtoms, missing atoms");
1207  }
1208 
1209  // Create storage for home patch forces
1210  PmeForceMsg *forceMsg;
1211  if (pmePencilType == 3 && CkNodeOf(msg->pe) == CkMyNode()) {
1212  // 3D FFT and compute resides on the same node => use zero-copy forces
1213  forceMsg = new (0, PRIORITY_SIZE) PmeForceMsg();
1214  forceMsg->zeroCopy = true;
1215  } else {
1216  forceMsg = new (msg->numAtoms, PRIORITY_SIZE) PmeForceMsg();
1217  forceMsg->zeroCopy = false;
1218  }
1219  forceMsg->numAtoms = msg->numAtoms;
1220  forceMsg->pe = msg->pe;
1221  forceMsg->compute = msg->compute;
1222  forceMsg->numStrayAtoms = numStrayAtomsPatch;
1223 
1224  bool done = false;
1225  // ----------------------------- lock start ---------------------------
1226  // Only after writing has finished, we get homePatchIndex
 1227  // This guarantees that for whichever thread receives "done=true", writing has finished on
1228  // ALL threads.
1229  CmiLock(lock_recvAtoms);
1230  numStrayAtoms += numStrayAtomsPatch;
1231  // Secure homePatchIndex. All writes after this must be inside lock-region
1232  int homePatchIndex = numHomePatchesRecv;
1233  // Store primary pencil first
1234  plList[atomI][homePatchIndex].push_back(PencilLocation(pp0, pencilPatchIndex[p0]));
1235  if (pmePencilType != 3) {
1236  // Go back to through neighboring pencils and store "homePatchIndex"
1237  for (int p=0;p < 9;p++) {
1238 
1239  int y = (p % 3);
1240  int z = (p / 3);
1241 
1242  int pp = y + z*yNBlocks;
1243  int numAtoms = pmeAtomFilerPtr->getNumAtoms(p);
1244  if (pp != pp0 && numAtoms > 0) {
1245  homePatchIndexList[atomI][pp].push_back(homePatchIndex);
1246  // plList[0...numHomePatches-1] = for each home patch stores the location of pencils that are
1247  // sharing it
1248  // plList[homePatchIndex].size() tells the number of pencils that the home patch is shared with
1249  plList[atomI][homePatchIndex].push_back(PencilLocation(pp, pencilPatchIndex[p]));
1250  }
1251  }
1252  }
1253  homePatchForceMsgs[atomI][homePatchIndex] = forceMsg;
1254  // numHomeAtoms[atomI][homePatchIndex] = msg->numAtoms;
1255  // Set the number of pencils contributing to this home patch
1256  numPencils[atomI][homePatchIndex] = plList[atomI][homePatchIndex].size();
1257  //
1258  numHomePatchesRecv++;
1259  if (numHomePatchesRecv == numHomePatches) {
1260  // Reset counter
1261  numHomePatchesRecv = 0;
1262  done = true;
1263  }
1264  CmiUnlock(lock_recvAtoms);
1265  // ----------------------------- lock end ---------------------------
1266 
1267  // plList[atomI][homePatchIndex] array tells you the location of pencils that are sharing this home patch
1268 
1269  delete msg;
1270 
1271  if (done) {
1272  // Pencil has received all home patches and writing to memory is done => send atoms to neighbors
 1273  sendAtomsToNeighbors();
 1274  }
1275 }
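Conceptually, recvAtoms() files each home patch's atoms into one of up to nine pencils (the 3x3 neighborhood around the primary pencil, indexed as p with y = p % 3 and z = p / 3), and anything landing outside that neighborhood is counted as a stray in slot 9, mirroring getNumAtoms(9) above. The sketch below is a simplified, hypothetical stand-in for PmeAtomFiler::fileAtoms; it assumes each atom has already been reduced to the pencil (y, z) it belongs to and omits the real coordinate-to-pencil mapping.

  #include <cstdio>
  #include <utility>
  #include <vector>

  int main() {
    int yBlocks = 6, zBlocks = 6;               // hypothetical pencil grid
    int pencilIndexY = 2, pencilIndexZ = 3;     // primary pencil of this device

    // Hypothetical atoms, already reduced to the pencil (y, z) they fall in
    std::vector<std::pair<int,int>> atomPencil = { {2,3}, {3,3}, {1,2}, {5,0}, {2,4} };

    std::vector<std::vector<int>> file(10);     // 9 neighborhood slots + strays
    for (int i = 0; i < (int)atomPencil.size(); i++) {
      // Offset from the primary pencil, wrapped to the nearest periodic image (simplified)
      int dy = atomPencil[i].first  - pencilIndexY;
      int dz = atomPencil[i].second - pencilIndexZ;
      if (dy >  yBlocks/2) dy -= yBlocks;
      if (dy < -yBlocks/2) dy += yBlocks;
      if (dz >  zBlocks/2) dz -= zBlocks;
      if (dz < -zBlocks/2) dz += zBlocks;
      if (dy >= -1 && dy <= 1 && dz >= -1 && dz <= 1) {
        int p = (dy + 1) + (dz + 1) * 3;        // p in 0..8, y = p % 3, z = p / 3
        file[p].push_back(i);
      } else {
        file[9].push_back(i);                   // stray atom
      }
    }
    for (int p = 0; p < 10; p++)
      printf("slot %d: %zu atoms\n", p, file[p].size());
    return 0;
  }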
void ComputePmeCUDADevice::recvAtomsFromNeighbor ( PmeAtomPencilMsg msg)

Definition at line 1323 of file ComputePmeCUDAMgr.C.

References PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::lattice, NAMD_bug(), PmeAtomPencilMsg::numAtoms, registerRecvAtomsFromNeighbor(), PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

1323  {
1324  // Store into primary pencil
1325  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1326  // Compute pencil index relative to primary pencil
1327  int y = msg->srcY - pencilIndexY;
1328  if (y < ylo) y += pmeGrid.yBlocks;
1329  if (y > yhi) y -= pmeGrid.yBlocks;
1330  int z = msg->srcZ - pencilIndexZ;
1331  if (z < zlo) z += pmeGrid.zBlocks;
1332  if (z > zhi) z -= pmeGrid.zBlocks;
1333  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1334  NAMD_bug("ComputePmeCUDADevice::recvAtomsFromNeighbor, pencil index outside bounds");
1335  }
1336  // Read energy and virial flags
1337  doEnergy = msg->doEnergy;
1338  doVirial = msg->doVirial;
1339  // Read lattice
1340  lattice = msg->lattice;
1341  // Pencil index where atoms came from
1342  int pp = y-ylo + (z-zlo)*yNBlocks;
1343  // Store atoms and mark down the patch index where these atoms were added
1344  neighborPatchIndex[pp] = pmeAtomStorage[atomI][pp0]->addAtoms(msg->numAtoms, msg->atoms);
1345 
1346  delete msg;
1347 
 1348  registerRecvAtomsFromNeighbor();
 1349 }
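The first step above recovers the sender's offset relative to the primary pencil, wrapping across the periodic pencil grid and rejecting anything outside the 3x3 neighborhood. Here is a minimal standalone sketch of that index recovery; the grid sizes and source indices are hypothetical.

  #include <cstdio>

  int main() {
    int yBlocks = 4, zBlocks = 5;
    int pencilIndexY = 0, pencilIndexZ = 4;     // primary pencil
    int ylo = -1, yhi = 1, zlo = -1, zhi = 1;   // full 3x3 neighborhood
    int yNBlocks = 3;

    int srcY = 3, srcZ = 0;                     // source pencil of the message

    int y = srcY - pencilIndexY;
    if (y < ylo) y += yBlocks;
    if (y > yhi) y -= yBlocks;
    int z = srcZ - pencilIndexZ;
    if (z < zlo) z += zBlocks;
    if (z > zhi) z -= zBlocks;

    if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
      printf("pencil index outside bounds\n");  // NAMD_bug() in the real code
      return 1;
    }
    int pp = (y - ylo) + (z - zlo) * yNBlocks;  // slot in the neighbor arrays
    printf("source (%d,%d) is offset (%d,%d), slot pp=%d\n", srcY, srcZ, y, z, pp);
    return 0;
  }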
void ComputePmeCUDADevice::recvForcesFromNeighbor ( PmeForcePencilMsg msg)

Definition at line 1529 of file ComputePmeCUDAMgr.C.

References NAMD_bug(), numPatches, PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, y, PmeGrid::yBlocks, z, and PmeGrid::zBlocks.

1529  {
1530 
1531  // Source pencil index
1532  int y = msg->srcY - pencilIndexY;
1533  if (y < ylo) y += pmeGrid.yBlocks;
1534  if (y > yhi) y -= pmeGrid.yBlocks;
1535  int z = msg->srcZ - pencilIndexZ;
1536  if (z < zlo) z += pmeGrid.zBlocks;
1537  if (z > zhi) z -= pmeGrid.zBlocks;
1538 
1539  if (y < ylo || y > yhi || z < zlo || z > zhi || (y == 0 && z == 0)) {
1540  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, pencil index outside bounds");
1541  }
1542 
1543  // Source pencil
1544  int pp = y-ylo + (z-zlo)*yNBlocks;
1545 
1546  // Store message (deleted in mergeForcesOnPatch)
1547  neighborForcePencilMsgs[pp] = msg;
1548 
1549  // neighborForcePencils[pp].force = new CudaForce[msg->numAtoms];
1550  // memcpy(neighborForcePencils[pp].force, msg->force, sizeof(CudaForce)*msg->numAtoms);
1551  // neighborForcePencils[pp].numAtoms = msg->numAtoms;
1552  // neighborForcePencils[pp].y = msg->y;
1553  // neighborForcePencils[pp].z = msg->z;
1554  // neighborForcePencils[pp].srcY = msg->srcY;
1555  // neighborForcePencils[pp].srcZ = msg->srcZ;
1556  // delete msg;
1557 
1558  // numPatches = number of home patches this pencil has
1559  int numPatches = pmeAtomStorage[forceI][pp]->getNumPatches();
1560  if (numPatches != homePatchIndexList[forceI][pp].size()) {
1561  NAMD_bug("ComputePmeCUDADevice::recvForcesFromNeighbor, numPatches incorrect");
1562  }
1563  for (int i=0;i < numPatches;i++) {
1564  // this pencil contributed to home patch with index "homePatchIndex"
1565  int homePatchIndex = homePatchIndexList[forceI][pp][i];
1566  // ----------------------------- lock start ---------------------------
 1567  // NOTE: We use a node-wide lock here for the entire numPencils[] array, while
 1568  // we really only need to lock each element. That would, however, require
 1569  // numHomePatches locks.
1570  bool done = false;
1571  CmiLock(lock_numPencils);
1572  numPencils[forceI][homePatchIndex]--;
1573  if (numPencils[forceI][homePatchIndex] == 0) done = true;
1574  CmiUnlock(lock_numPencils);
1575  // ----------------------------- lock end ---------------------------
1576  if (done) {
1577  // This home patch is done, launch force merging
1578  thisProxy[CkMyNode()].mergeForcesOnPatch(homePatchIndex);
1579  }
1580  }
1581 
1582 }
void ComputePmeCUDADevice::registerNeighbor ( )

Definition at line 1139 of file ComputePmeCUDAMgr.C.

1139  {
1140  CmiLock(lock_numHomePatchesMerged);
1141  numNeighborsExpected++;
1142  CmiUnlock(lock_numHomePatchesMerged);
1143 }
void ComputePmeCUDADevice::registerRecvAtomsFromNeighbor ( )

Definition at line 1351 of file ComputePmeCUDAMgr.C.

References spreadCharge().

Referenced by recvAtomsFromNeighbor(), and sendAtomsToNeighbors().

1351  {
1352  // Primary pencil
1353  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1354 
1355  bool done = false;
1356  // ----------------------------- lock start ---------------------------
1357  CmiLock(lock_numNeighborsRecv);
1358  numNeighborsRecv++;
1359  if (numNeighborsRecv == numNeighborsExpected) {
1360  // Reset counter
1361  numNeighborsRecv = 0;
1362  done = true;
1363  }
1364  CmiUnlock(lock_numNeighborsRecv);
1365  // ----------------------------- lock end ---------------------------
1366 
1367  if (done) {
1368  // Primary pencil has received all atoms and writing has finished => spread charge
1369  spreadCharge();
1370  }
1371 }
void ComputePmeCUDADevice::sendAtomsToNeighbor ( int  y,
int  z,
int  atomIval 
)

Definition at line 1295 of file ComputePmeCUDAMgr.C.

References atoms, PmeAtomPencilMsg::atoms, PmeAtomPencilMsg::doEnergy, PmeAtomPencilMsg::doVirial, PmeAtomPencilMsg::lattice, PmeAtomPencilMsg::numAtoms, PRIORITY_SIZE, PmeAtomPencilMsg::srcY, PmeAtomPencilMsg::srcZ, PmeAtomPencilMsg::y, PmeGrid::yBlocks, PmeAtomPencilMsg::z, and PmeGrid::zBlocks.

1295  {
1296  // Pencil index
1297  int pp = y-ylo + (z-zlo)*yNBlocks;
1298  // This neighbor pencil is done, finish it up before accessing it
1299  pmeAtomStorage[atomIval][pp]->finish();
1300  // Compute destination neighbor pencil index (yt,zt)
1301  int yt = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1302  int zt = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1303  int numAtoms = pmeAtomStorage[atomIval][pp]->getNumAtoms();
1304  CudaAtom* atoms = pmeAtomStorage[atomIval][pp]->getAtoms();
1305  PmeAtomPencilMsg* msgPencil = new (numAtoms, PRIORITY_SIZE) PmeAtomPencilMsg;
1306  memcpy(msgPencil->atoms, atoms, numAtoms*sizeof(CudaAtom));
1307  msgPencil->numAtoms = numAtoms;
1308  // Store destination pencil index
1309  msgPencil->y = yt;
1310  msgPencil->z = zt;
1311  // Store source pencil index
1312  msgPencil->srcY = pencilIndexY;
1313  msgPencil->srcZ = pencilIndexZ;
1314  // Store energy and virial flags
1315  msgPencil->doEnergy = doEnergy;
1316  msgPencil->doVirial = doVirial;
1317  // Store lattice
1318  msgPencil->lattice = lattice;
1319  int node = mgrProxy.ckLocalBranch()->getNode(yt, zt);
1320  mgrProxy[node].recvAtomsFromNeighbor(msgPencil);
1321 }
void ComputePmeCUDADevice::sendAtomsToNeighbors ( )

Definition at line 1280 of file ComputePmeCUDAMgr.C.

References registerRecvAtomsFromNeighbor(), y, and z.

Referenced by recvAtoms().

1280  {
1281  for (int z=zlo;z <= zhi;z++) {
1282  for (int y=ylo;y <= yhi;y++) {
1283  // Only send to neighbors, not self
1284  if (y != 0 || z != 0) {
1285  // NOTE: Must send atomI -value since this will change in spreadCharge(), which might occur
1286  // before these sends have been performed
1287  thisProxy[CkMyNode()].sendAtomsToNeighbor(y, z, atomI);
1288  }
1289  }
1290  }
1291  // Register primary pencil
 1292  registerRecvAtomsFromNeighbor();
 1293 }
void ComputePmeCUDADevice::sendForcesToNeighbors ( )

Definition at line 1496 of file ComputePmeCUDAMgr.C.

References PmeForcePencilMsg::force, PmeForcePencilMsg::numAtoms, PRIORITY_SIZE, PmeForcePencilMsg::srcY, PmeForcePencilMsg::srcZ, PmeForcePencilMsg::y, y, PmeGrid::yBlocks, PmeForcePencilMsg::z, z, and PmeGrid::zBlocks.

Referenced by gatherForceDone().

1496  {
1497  // Primary pencil has the forces
1498  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1499  int* patchPos = pmeAtomStorage[forceI][pp0]->getPatchPos();
1500  // Loop through neighboring pencils
1501  for (int z=zlo;z <= zhi;z++) {
1502  for (int y=ylo;y <= yhi;y++) {
1503  // Only send to neighbors, not self
1504  if (y != 0 || z != 0) {
1505  int pp = y-ylo + (z-zlo)*yNBlocks;
1506  int patchIndex = neighborPatchIndex[pp];
1507  int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex-1];
1508  int atomEnd = patchPos[patchIndex];
1509  int natom = atomEnd-atomStart;
1510  // copy forces
 1511  PmeForcePencilMsg *msg = new (natom, PRIORITY_SIZE) PmeForcePencilMsg;
 1512  msg->numAtoms = natom;
1513  memcpy(msg->force, force+atomStart, natom*sizeof(CudaForce));
1514  // Calculate destination pencil index (dstY, dstZ) for this neighbor
1515  int dstY = (pencilIndexY + y + pmeGrid.yBlocks) % pmeGrid.yBlocks;
1516  int dstZ = (pencilIndexZ + z + pmeGrid.zBlocks) % pmeGrid.zBlocks;
1517  int node = mgrProxy.ckLocalBranch()->getNode(dstY, dstZ);
1518  msg->y = dstY;
1519  msg->z = dstZ;
1520  // Store source pencil index
1521  msg->srcY = pencilIndexY;
1522  msg->srcZ = pencilIndexZ;
1523  mgrProxy[node].recvForcesFromNeighbor(msg);
1524  }
1525  }
1526  }
1527 }
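Both this routine and mergeForcesOnPatch() locate a patch's atoms through the array returned by getPatchPos(), which holds the running sum of atom counts per stored patch (i.e., each patch's end offset). A small sketch of how the atom range is recovered from it, with made-up counts:

  #include <cstdio>
  #include <vector>

  int main() {
    std::vector<int> patchPos = { 12, 30, 41 };  // 3 patches: 12, 18, 11 atoms

    for (int patchIndex = 0; patchIndex < (int)patchPos.size(); patchIndex++) {
      int atomStart = (patchIndex == 0) ? 0 : patchPos[patchIndex - 1];
      int atomEnd   = patchPos[patchIndex];
      int natom     = atomEnd - atomStart;
      printf("patch %d: atoms [%d, %d), %d atoms\n", patchIndex, atomStart, atomEnd, natom);
    }
    return 0;
  }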
void ComputePmeCUDADevice::sendForcesToPatch ( PmeForceMsg forceMsg)

Definition at line 1691 of file ComputePmeCUDAMgr.C.

References PmeForceMsg::compute, Compute::localWorkMsg, PmeForceMsg::pe, and ComputePmeCUDA::storePmeForceMsg().

Referenced by mergeForcesOnPatch().

1691  {
1692  // Now we're on the node that has Pe, hence "compute" -pointer is valid
1693  int pe = forceMsg->pe;
1694  ComputePmeCUDA *compute = forceMsg->compute;
1695 
1696  // Store message for use in ComputePmeCUDA, where it'll also be deleted.
1697  if (compute->storePmeForceMsg(forceMsg)) {
1698  // Enqueue on the pe that sent the atoms in the first place
1699  LocalWorkMsg *lmsg = compute->localWorkMsg;
1700  CProxy_WorkDistrib wdProxy(CkpvAccess(BOCclass_group).workDistrib);
1701  wdProxy[pe].enqueuePme(lmsg);
1702  }
1703 }
void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXYZ  pmePencilXYZ_in)

Definition at line 1079 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

1079  {
1080  if (pmePencilType != 3)
1081  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(1), invalid pmePencilType");
1082  pmePencilXYZ = pmePencilXYZ_in;
1083 }
void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilXY  pmePencilXY_in)

Definition at line 1085 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

1085  {
1086  if (pmePencilType != 2)
1087  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(2), invalid pmePencilType");
1088  pmePencilXY = pmePencilXY_in;
1089 }
void ComputePmeCUDADevice::setPencilProxy ( CProxy_CudaPmePencilX  pmePencilX_in)

Definition at line 1091 of file ComputePmeCUDAMgr.C.

References NAMD_bug().

1091  {
1092  if (pmePencilType != 1)
1093  NAMD_bug("ComputePmeCUDADevice::setPencilProxy(3), invalid pmePencilType");
1094  pmePencilX = pmePencilX_in;
1095 }
void ComputePmeCUDADevice::spreadCharge ( )

Definition at line 1373 of file ComputePmeCUDAMgr.C.

References atoms, PmeRealSpaceCompute::copyAtoms(), PmeRunMsg::doEnergy, PmeRunMsg::doVirial, PmeRunMsg::lattice, PmeRunMsg::numStrayAtoms, PmeRealSpaceCompute::spreadCharge(), and msm::swap().

Referenced by registerRecvAtomsFromNeighbor().

1373  {
1374  // Spread charges in primary pencil
1375  int pp0 = 0-ylo + (0-zlo)*yNBlocks;
1376  // Primary pencil is done, finish it up before accessing it
1377  // (clearing is done in mergeForcesOnPatch)
1378  pmeAtomStorage[atomI][pp0]->finish();
1379  // Get the number of atoms and pointer to atoms
1380  int numAtoms = pmeAtomStorage[atomI][pp0]->getNumAtoms();
1381  CudaAtom* atoms = pmeAtomStorage[atomI][pp0]->getAtoms();
1382  // Flip atomI <-> forceI
1383  std::swap(atomI, forceI);
1384  // Re-allocate force buffer if needed
1385  reallocate_host<CudaForce>(&force, &forceCapacity, numAtoms, 1.5f);
1386  // (already have the updated lattice)
1387  pmeRealSpaceCompute->copyAtoms(numAtoms, atoms);
1388  // Spread charge
1389  beforeWalltime = CmiWallTimer();
1390  pmeRealSpaceCompute->spreadCharge(lattice);
1391  // Send "charge grid ready to PME solver"
1392  PmeRunMsg *pmeRunMsg = new PmeRunMsg();
1393  pmeRunMsg->doVirial = doVirial;
1394  pmeRunMsg->doEnergy = doEnergy;
1395  pmeRunMsg->lattice = lattice;
1396  pmeRunMsg->numStrayAtoms = numStrayAtoms;
1397  // Reset stray atom counter
1398  numStrayAtoms = 0;
1399  switch(pmePencilType) {
1400  case 1:
1401  pmePencilX(0, pencilIndexY, pencilIndexZ).chargeGridReady(pmeRunMsg);
1402  break;
1403  case 2:
1404  pmePencilXY(0, 0, pencilIndexZ).chargeGridReady(pmeRunMsg);
1405  break;
1406  case 3:
1407  pmePencilXYZ[0].chargeGridReady(pmeRunMsg);
1408  break;
1409  }
1410 }
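The std::swap of atomI and forceI implements a simple double buffer: while forces for the current step are gathered and merged out of the forceI side, atoms for the next step can already be received into the atomI side. A minimal sketch of that index flip over a few steps:

  #include <cstdio>
  #include <utility>

  int main() {
    int atomI = 0, forceI = 1;   // initial values, as in the constructor

    for (int step = 0; step < 3; step++) {
      printf("step %d: receive atoms into buffer %d\n", step, atomI);
      std::swap(atomI, forceI);  // done at spread-charge time
      printf("step %d: gather/merge forces from buffer %d\n", step, forceI);
    }
    return 0;
  }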

The documentation for this class was generated from the following files:
ComputePmeCUDAMgr.h
ComputePmeCUDAMgr.C