CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded:

CudaComputeNonbonded inherits from Compute and ComputeNonbondedUtil.

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbonded ()
void registerComputeSelf (ComputeID cid, PatchID pid)
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
void assignPatches (ComputeMgr *computeMgrIn)
virtual void initialize ()
virtual void atomUpdate ()
virtual int noWork ()
virtual void doWork ()
void launchWork ()
void finishReductions ()
void unregisterBoxesOnPe ()
void assignPatchesOnPe ()
void openBoxesOnPe ()
void skipPatchesOnPe ()
void finishPatchesOnPe ()
void finishPatchOnPe (int i)
void messageEnqueueWork ()
virtual void patchReady (PatchID, int doneMigration, int seq)
virtual void gbisP2PatchReady (PatchID, int seq)
virtual void gbisP3PatchReady (PatchID, int seq)

Classes

struct  ComputeRecord
struct  PatchRecord

Detailed Description

CudaComputeNonbonded is the CUDA implementation of NAMD's short-range nonbonded force compute. It aggregates the self and pair computes of all patches assigned to a single GPU, distributes those patches across the PEs sharing the device, launches the nonbonded (and, when enabled, GBIS) kernels on that device, and submits the resulting energies and virials through the reduction system. An optional streaming mode delivers per-patch forces back to the host as they complete.

Definition at line 21 of file CudaComputeNonbonded.h.
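
The cross-references on this page (ComputeMgr::createComputes() is listed as the caller of assignPatches() and initialize()) suggest a setup sequence along the following lines. This is an illustrative sketch with invented compute and patch IDs, not code taken from NAMD:

  // Illustrative setup sequence inferred from the "Referenced by" notes on
  // this page; compute/patch IDs are invented.
  void setupCudaNonbonded(ComputeMgr* computeMgr, int deviceID,
                          CudaNonbondedTables& tables, bool doStreaming) {
    CudaComputeNonbonded* cudaNbond =
      new CudaComputeNonbonded(/*ComputeID*/ 0, deviceID, tables, doStreaming);

    // Register one self compute and one pair compute (IDs invented).
    cudaNbond->registerComputeSelf(/*cid*/ 1, /*pid*/ 0);
    PatchID pids[2] = {0, 1};
    int trans[2]    = {13, 13};  // 13 = identity periodic image (see registerComputePair)
    cudaNbond->registerComputePair(/*cid*/ 2, pids, trans);

    // Distribute the (deduplicated) patches over the PEs sharing this device,
    // then allocate streams, events, and the reduction on the master PE.
    cudaNbond->assignPatches(computeMgr);
    cudaNbond->initialize();
  }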


Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 36 of file CudaComputeNonbonded.C.

References cudaCheck, Compute::gbisPhase, NAMD_die(), Node::Object(), Node::simParameters, and simParams.

00037                                                               : 
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084 
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 }
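
The cudaCheck macro wraps every CUDA runtime call above, but its definition is not shown on this page. Below is a minimal sketch of such an error-check macro, assuming the common convert-error-to-fatal pattern; the real NAMD version may instead report through NAMD_die():

  #include <cuda_runtime.h>
  #include <cstdio>
  #include <cstdlib>

  // Sketch of a cudaCheck-style macro; the actual NAMD definition may differ.
  #define cudaCheck(stmt) do {                                  \
      cudaError_t err = (stmt);                                 \
      if (err != cudaSuccess) {                                 \
        fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                cudaGetErrorString(err), __FILE__, __LINE__);   \
        exit(1);                                                \
      }                                                         \
    } while (0)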

CudaComputeNonbonded::~CudaComputeNonbonded (  ) 

Definition at line 106 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

00106                                             {
00107   cudaCheck(cudaSetDevice(deviceID));
00108   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00109   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00110   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00111   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00112   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00113   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00114   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00115   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00116 
00117   // GBIS
00118   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00119   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00120   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00121   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00122   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00123   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00124 
00125   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00126 
00127   if (patches.size() > 0) {
00128     deallocate_host<VirialEnergy>(&h_virialEnergy);
00129     deallocate_device<VirialEnergy>(&d_virialEnergy);
00130     cudaCheck(cudaStreamDestroy(stream));
00131     cudaCheck(cudaEventDestroy(forceDoneEvent));
00132     CmiDestroyLock(lock);
00133     delete reduction;
00134   }
00135 
00136   // NOTE: unregistering happens in the [sync] entry method
00137   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00138 
00139 }
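
The destructor releases every buffer through allocate/deallocate wrapper templates defined elsewhere in NAMD. Their usage here implies the shapes sketched below (pointer passed by address, reset to NULL on free); this is an assumption-based sketch built on the CUDA pinned-memory API and the cudaCheck sketch above, not the actual NAMD implementation:

  #include <cuda_runtime.h>
  #include <cstddef>

  // Sketched host-pinned wrappers consistent with the calls above; the real
  // NAMD templates may differ (e.g., in alignment or error handling).
  template <typename T>
  void allocate_host(T** p, size_t n) {
    cudaCheck(cudaMallocHost((void**)p, n * sizeof(T)));  // pinned host memory
  }

  template <typename T>
  void deallocate_host(T** p) {
    cudaCheck(cudaFreeHost(*p));
    *p = NULL;  // matches the NULL guards used by the destructor
  }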


Member Function Documentation

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn  ) 

Definition at line 363 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), j, NAMD_bug(), ComputePmeCUDAMgr::Object(), PatchMap::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), and Compute::setNumPatches().

Referenced by ComputeMgr::createComputes().

00363                                                                  {
00364   // Remove duplicate patches
00365   std::sort(patches.begin(), patches.end());
00366   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00367   patches.erase(last, patches.end());
00368   // Set number of patches
00369   setNumPatches(patches.size());
00370   masterPe = CkMyPe();
00371   computeMgr = computeMgrIn;
00372   // Start patch counter
00373   patchesCounter = getNumPatches();
00374   // Patch ID map
00375   std::map<PatchID, int> pidMap;
00376 #if 1
00377   //-------------------------------------------------------
00378   // Copied in from ComputeNonbondedCUDA::assignPatches()
00379   //-------------------------------------------------------
00380 
00381   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00382   int numPesOnNodeSharingDevice = 0;
00383   int masterIndex = -1;
00384   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00385     int pe = deviceCUDA->getPesSharingDevice(i);
00386     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00387     if ( CkNodeOf(pe) == CkMyNode() ) {
00388       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00389     }
00390   }
00391 
00392   std::vector<int> count(patches.size(), 0);
00393   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00394   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00395   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00396 
00397   PatchMap* patchMap = PatchMap::Object();
00398 
00399   int unassignedpatches = patches.size();
00400 
00401   for (int i=0;i < patches.size(); ++i) {
00402     patches[i].pe = -1;
00403   }
00404 
00405   // assign if home pe and build table of natural proxies
00406   for (int i=0;i < patches.size(); ++i) {
00407     int pid = patches[i].patchID;
00408     // homePe = PE where the patch currently resides
00409     int homePe = patchMap->node(pid);
00410     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00411       int pe = pesOnNodeSharingDevice[j];
00412       // If homePe is sharing this device, assign this patch to homePe
00413       if ( pe == homePe ) {
00414         patches[i].pe = pe;
00415         --unassignedpatches;
00416         pcount[j] += 1;
00417       }
00418       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00419         table[i*numPesOnNodeSharingDevice + j] = 1;
00420       }
00421     }
00422     // Assign this patch to homePe, if it resides on the same node
00423     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00424       patches[i].pe = homePe;
00425       --unassignedpatches;
00426       rankpcount[CkRankOf(homePe)] += 1;
00427     }
00428   }
00429   // assign if only one pe has a required proxy
00430   for (int i=0; i < patches.size(); ++i) {
00431     int pid = patches[i].patchID;
00432     if ( patches[i].pe != -1 ) continue;
00433     int c = 0;
00434     int lastj;
00435     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00436       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00437         ++c;
00438         lastj = j;
00439       }
00440     }
00441     count[i] = c;
00442     if ( c == 1 ) {
00443       patches[i].pe = pesOnNodeSharingDevice[lastj];
00444       --unassignedpatches;
00445       pcount[lastj] += 1;
00446     }
00447   }
00448   int assignj = 0;
00449   while ( unassignedpatches ) {
00450     int i;
00451     for (i=0;i < patches.size(); ++i) {
00452       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00453       int pid = patches[i].patchID;
00454       // patch_record &pr = patchRecords[pid];
00455       if ( patches[i].pe != -1 ) continue;
00456       patches[i].pe = pesOnNodeSharingDevice[assignj];
00457       --unassignedpatches;
00458       pcount[assignj] += 1;
00459       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00460       break;
00461     }
00462     if (i < patches.size() ) continue;  // start search again
00463     for ( i=0;i < patches.size(); ++i ) {
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       if ( count[i] ) continue;
00468       patches[i].pe = pesOnNodeSharingDevice[assignj];
00469       --unassignedpatches;
00470       pcount[assignj] += 1;
00471       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00472       break;
00473     }
00474     if ( i < patches.size() ) continue;  // start search again
00475     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00476   }
00477 
00478   // For each rank, list of patches
00479   rankPatches.resize(CkMyNodeSize());
00480   for (int i=0; i < patches.size(); ++i) {
00481     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00482     pidMap[patches[i].patchID] = i;
00483   }
00484 
00485   // for ( int i=0; i < patches.size(); ++i ) {
00486   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00487   // }
00488 
00489 /*
00490   slavePes = new int[CkMyNodeSize()];
00491   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00492   numSlaves = 0;
00493   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00494     int pe = pesOnNodeSharingDevice[j];
00495     int rank = pe - CkNodeFirst(CkMyNode());
00496     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00497     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00498     if ( pe == CkMyPe() ) continue;
00499     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00500     rankpcount[rank] = 0;  // skip in rank loop below
00501     slavePes[numSlaves] = pe;
00502     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00503     ++numSlaves;
00504   }
00505   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00506     int pe = CkNodeFirst(CkMyNode()) + j;
00507     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rankpcount[j]);
00509     if ( ! rankpcount[j] ) continue;
00510     if ( pe == CkMyPe() ) continue;
00511     slavePes[numSlaves] = pe;
00512     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00513     ++numSlaves;
00514   }
00515 */
00516 
00517 #else
00518   // For each rank, list of patches
00519   rankPatches.resize(CkMyNodeSize());
00520   // For each rank, list of home patch IDs
00521   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00522   for (int i=0;i < CkMyNodeSize();i++) {
00523     int pe = CkNodeFirst(CkMyNode()) + i;
00524     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00525   }
00526   std::vector<int> proxyPatchPes;
00527   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00528   //--------------------------------------------------------
00529   // Build a list of PEs to avoid
00530   std::vector<int> pesToAvoid;
00531 #if 0
00532   // Avoid other GPUs' master PEs
00533   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00534     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00535     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00536   }
00537   // Avoid PEs that are involved in PME
00538   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00539   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00540     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00541   }
00542   // Set counters of avoidable PEs to high numbers
00543   for (int i=0;i < pesToAvoid.size();i++) {
00544     int pe = pesToAvoid[i];
00545     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);    
00546   }
00547 #endif
00548   // Avoid master Pe somewhat
00549   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00550   //--------------------------------------------------------
00551   for (int i=0;i < patches.size();i++) {
00552     PatchID pid = patches[i].patchID;
00553     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00554     if (pe == -1) {
00555       // Patch not present on this node => try finding a ProxyPatch
00556       findProxyPatchPes(proxyPatchPes, pid);
00557       if (proxyPatchPes.size() == 0) {
00558         // No ProxyPatch => create one on rank that has the least ProxyPatches
00559         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00560         pe = CkNodeFirst(CkMyNode()) + rank;
00561         peProxyPatchCounter[rank]++;
00562       } else {
00563         // Choose a ProxyPatch, trying to avoid masterPe (the current Pe) and Pes that already
00564         // have a ProxyPatch. This is done by finding the entry with the minimum
00565         // peProxyPatchCounter value, i.e., the minimum among
00566         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
00567         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00568         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00569         //   - proxyPatchPes.begin();
00570         // pe = proxyPatchPes[pppi];
00571         int minCounter = (1 << 30);
00572         for (int j=0;j < proxyPatchPes.size();j++) {
00573           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00574             pe = proxyPatchPes[j];
00575             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00576           }
00577         }
00578         if (pe == -1)
00579           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00580         peProxyPatchCounter[CkRankOf(pe)]++;
00581       }
00582     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00583       // Found home patch on this node, but it's on PE that should be avoided => find a new one
00584       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00585       pe = CkNodeFirst(CkMyNode()) + rank;
00586       peProxyPatchCounter[rank]++;
00587     }
00588     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00589       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00590     rankPatches[CkRankOf(pe)].push_back(i);
00591     pidMap[pid] = i;
00592   }
00593 
00594   delete [] rankHomePatchIDs;
00595 #endif
00596   // Setup computes using pidMap
00597   for (int i=0;i < computes.size();i++) {
00598     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00599     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00600   }
00601   for (int i=0;i < CkMyNodeSize();i++) {
00602     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00603   }
00604   computeMgr->sendAssignPatchesOnPe(pes, this);
00605 }
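
The std::sort/std::unique dedupe at the top of assignPatches() requires PatchRecord to be ordered and equality-comparable. Judging from the usage on this page, comparison by patchID is the natural choice; the following self-contained sketch spells out that assumption (the real PatchRecord, listed under Classes above, carries more fields):

  #include <algorithm>
  #include <vector>

  // Assumed comparison semantics for the dedupe; named *Sketch to avoid
  // claiming this is the real struct.
  struct PatchRecordSketch {
    int patchID;
    int pe;
    bool operator<(const PatchRecordSketch& o) const { return patchID < o.patchID; }
    bool operator==(const PatchRecordSketch& o) const { return patchID == o.patchID; }
  };

  void removeDuplicatePatches(std::vector<PatchRecordSketch>& patches) {
    std::sort(patches.begin(), patches.end());
    patches.erase(std::unique(patches.begin(), patches.end()), patches.end());
  }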

void CudaComputeNonbonded::assignPatchesOnPe (  ) 

Definition at line 301 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), j, NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< Elem >::size(), and y.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

00301                                              {
00302   if (rankPatches[CkMyRank()].size() == 0)
00303     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00304 
00305   // calculate priority rank of local home patch within pe
00306   {
00307     PatchMap* patchMap = PatchMap::Object();
00308     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00309     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00310       int i = rankPatches[CkMyRank()][k];
00311       int pid = patches[i].patchID;
00312       int homePe = patchMap->node(pid);
00313       if ( CkNodeOf(homePe) == CkMyNode() ) {
00314         int2 pid_index;
00315         pid_index.x = pid;
00316         pid_index.y = i;
00317         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00318       }
00319     }
00320     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00321       pid_sortop_reverse_priority so;
00322       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00323       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00324       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00325         int index = homePatchByRank[i][j].y;
00326         patches[index].reversePriorityRankInPe = j + masterBoost;
00327       }
00328     }
00329   }
00330 
00331   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00332     assignPatch(rankPatches[CkMyRank()][i]);
00333   }
00334 }
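
pid_sortop_reverse_priority is not documented on this page. The std::sort call above only requires a strict weak ordering over int2 {patchID, index} pairs; the stand-in below shows the expected shape with a purely hypothetical ordering (the real NAMD comparator orders by patch priority, not patch ID):

  #include <cuda_runtime.h>  // for int2

  // Hypothetical stand-in; only the call shape matters here.
  struct pid_sortop_reverse_priority_sketch {
    bool operator()(const int2& a, const int2& b) const {
      return a.x > b.x;  // placeholder: descending patch ID
    }
  };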

void CudaComputeNonbonded::atomUpdate (  )  [virtual]

Reimplemented from Compute.

Definition at line 638 of file CudaComputeNonbonded.C.

00638                                       {
00639   atomsChangedIn = true;
00640 }

void CudaComputeNonbonded::doWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 914 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), Node::simParameters, and simParams.

00914                                   {
00915   if (CkMyPe() != masterPe)
00916     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00917 
00918   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00919   // atomsChangedIn can be set to true by any Pe
00920   // atomsChanged can only be set by masterPe
00921   // Using two variables ensures there is no race condition
00922   atomsChanged = atomsChangedIn;
00923   atomsChangedIn = false;
00924 
00925   SimParameters *simParams = Node::Object()->simParameters;
00926 
00927   if (patches.size() == 0) return;  // No work to do
00928 
00929   // Take the flags from the first patch on this Pe
00930   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00931   Flags &flags = patches[0].patch->flags;
00932 
00933   doSlow = flags.doFullElectrostatics;
00934   doEnergy = flags.doEnergy;
00935   doVirial = flags.doVirial;
00936 
00937   if (flags.doNonbonded) {
00938 
00939     if (simParams->GBISOn) {
00940       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00941     }
00942 
00943     if (!simParams->GBISOn || gbisPhase == 1) {
00944       if ( computesChanged ) {
00945         updateComputes();
00946       }
00947       if (atomsChanged) {
00948         // Re-calculate patch atom numbers and storage
00949         updatePatches();
00950         reSortDone = false;
00951       }
00952       reallocateArrays();
00953     }
00954 
00955     // Open boxes on Pes and launch work to masterPe
00956     computeMgr->sendOpenBoxesOnPe(pes, this);
00957 
00958   } else {
00959     // No work to do, skip
00960     skip();
00961   }
00962 
00963 }
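
The atomsChangedIn/atomsChanged handoff described in the comment above is a small but useful pattern: any PE may raise the "In" flag, while only masterPe latches and clears it. Restated as a self-contained sketch (in NAMD the safety comes from Charm++ entry-method scheduling, not atomics; this only shows the data flow):

  // Two-variable change flag, as in atomUpdate()/doWork().
  struct ChangeFlag {
    bool changedIn = false;  // set by any PE (cf. atomUpdate())
    bool changed = false;    // read and written only by masterPe

    void signal() { changedIn = true; }
    void latch() {           // masterPe, at the start of doWork()
      changed = changedIn;
      changedIn = false;
    }
  };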

void CudaComputeNonbonded::finishPatchesOnPe (  ) 

Definition at line 1360 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

01360                                              {
01361   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01362 }

void CudaComputeNonbonded::finishPatchOnPe ( int  i  ) 

Definition at line 1367 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

01367                                                 {
01368   std::vector<int> v(1, i);
01369   finishSetOfPatchesOnPe(v);
01370 }

void CudaComputeNonbonded::finishReductions (  ) 

Definition at line 1201 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_VIRIAL_NBOND, REDUCTION_VIRIAL_SLOW, Node::simParameters, simParams, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

01201                                             {
01202 
01203   if (CkMyPe() != masterPe)
01204     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01205   
01206   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01207 
01208   if (!doSkip) {
01209 
01210     if (doStreaming && (doVirial || doEnergy)) {
01211       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01212       if (!forceDoneEventRecord)
01213         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01214       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01215       forceDoneEventRecord = false;
01216     }
01217 
01218     if (doVirial) {
01219       Tensor virialTensor;
01220       virialTensor.xx = h_virialEnergy->virial[0];
01221       virialTensor.xy = h_virialEnergy->virial[1];
01222       virialTensor.xz = h_virialEnergy->virial[2];
01223       virialTensor.yx = h_virialEnergy->virial[3];
01224       virialTensor.yy = h_virialEnergy->virial[4];
01225       virialTensor.yz = h_virialEnergy->virial[5];
01226       virialTensor.zx = h_virialEnergy->virial[6];
01227       virialTensor.zy = h_virialEnergy->virial[7];
01228       virialTensor.zz = h_virialEnergy->virial[8];
01229       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01230       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01231       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01232       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01233       if (doSlow) {
01234         Tensor virialTensor;
01235         virialTensor.xx = h_virialEnergy->virialSlow[0];
01236         virialTensor.xy = h_virialEnergy->virialSlow[1];
01237         virialTensor.xz = h_virialEnergy->virialSlow[2];
01238         virialTensor.yx = h_virialEnergy->virialSlow[3];
01239         virialTensor.yy = h_virialEnergy->virialSlow[4];
01240         virialTensor.yz = h_virialEnergy->virialSlow[5];
01241         virialTensor.zx = h_virialEnergy->virialSlow[6];
01242         virialTensor.zy = h_virialEnergy->virialSlow[7];
01243         virialTensor.zz = h_virialEnergy->virialSlow[8];
01244         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01245         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01246         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01247         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01248       }
01249     }
01250     if (doEnergy) {
01251       // if (doSlow)
01252       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01253       SimParameters *simParams = Node::Object()->simParameters;
01254       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01255       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01256       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01257       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01258       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01259     }
01260 
01261     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01262   }
01263   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01264   reduction->submit();
01265 
01266   // Reset flags
01267   doSkip = false;
01268   computesChanged = false;
01269 }
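
The nine-element unpacking of h_virialEnergy->virial and h_virialEnergy->virialSlow above is the same row-major mapping both times (array index = 3*row + column). The same mapping as a small helper, with a minimal stand-in for NAMD's Tensor:

  // Minimal stand-in for NAMD's Tensor, just for this sketch.
  struct Tensor { double xx, xy, xz, yx, yy, yz, zx, zy, zz; };

  // Row-major mapping used twice in finishReductions().
  Tensor tensorFromArray9(const double v[9]) {
    Tensor t;
    t.xx = v[0]; t.xy = v[1]; t.xz = v[2];
    t.yx = v[3]; t.yy = v[4]; t.yz = v[5];
    t.zx = v[6]; t.zy = v[7]; t.zz = v[8];
    return t;
  }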

void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  pid,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 244 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

00244                                                                 {
00245   CmiLock(lock);
00246   Compute::gbisP2PatchReady(pid, seq);
00247   CmiUnlock(lock);
00248 }

void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  pid,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 250 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

00250                                                                 {
00251   CmiLock(lock);
00252   Compute::gbisP3PatchReady(pid, seq);
00253   CmiUnlock(lock);
00254 }

void CudaComputeNonbonded::initialize (  )  [virtual]

Reimplemented from Compute.

Definition at line 607 of file CudaComputeNonbonded.C.

References cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00607                                       {
00608   if (patches.size() > 0) {
00609     // Allocate CUDA version of patches
00610     cudaCheck(cudaSetDevice(deviceID));
00611     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00612 
00613     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00614     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00615 
00616 #if CUDA_VERSION >= 5050
00617     int leastPriority, greatestPriority;
00618     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00619     int priority = (doStreaming) ? leastPriority : greatestPriority;
00620     // int priority = greatestPriority;
00621     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00622 #else
00623     cudaCheck(cudaStreamCreate(&stream));
00624 #endif
00625     cudaCheck(cudaEventCreate(&forceDoneEvent));
00626 
00627     buildExclusions();
00628 
00629     lock = CmiCreateLock();
00630 
00631     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00632   }  
00633 }

void CudaComputeNonbonded::launchWork (  ) 

Definition at line 965 of file CudaComputeNonbonded.C.

References cudaCheck, ComputeNonbondedUtil::cutoff, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, simParams, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

00965                                       {
00966   if (CkMyPe() != masterPe)
00967     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00968 
00969   beforeForceCompute = CkWallTimer();
00970 
00971   cudaCheck(cudaSetDevice(deviceID));
00972   SimParameters *simParams = Node::Object()->simParameters;
00973 
00974   //execute only during GBIS phase 1, or if not using GBIS
00975   if (!simParams->GBISOn || gbisPhase == 1) {
00976 
00977     if ( atomsChanged || computesChanged ) {
00978       // Invalidate pair lists
00979       pairlistsValid = false;
00980       pairlistTolerance = 0.0f;
00981     }
00982 
00983     // Get maximum atom movement and patch tolerance
00984     float maxAtomMovement = 0.0f;
00985     float maxPatchTolerance = 0.0f;
00986     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00987     // Update pair-list cutoff
00988     Flags &flags = patches[0].patch->flags;
00989     savePairlists = false;
00990     usePairlists = false;
00991     if ( flags.savePairlists ) {
00992       savePairlists = true;
00993       usePairlists = true;
00994     } else if ( flags.usePairlists ) {
00995       if ( ! pairlistsValid ||
00996            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
00997         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
00998       } else {
00999         usePairlists = true;
01000       }
01001     }
01002     if ( ! usePairlists ) {
01003       pairlistsValid = false;
01004     }
01005     float plcutoff = cutoff;
01006     if ( savePairlists ) {
01007       pairlistsValid = true;
01008       pairlistTolerance = 2. * maxPatchTolerance;
01009       plcutoff += pairlistTolerance;
01010     }
01011     plcutoff2 = plcutoff * plcutoff;
01012 
01013     // if (atomsChanged)
01014     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01015     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01016 
01017   } // if (!simParams->GBISOn || gbisPhase == 1)
01018 
01019   // Calculate PME & VdW forces
01020   if (!simParams->GBISOn || gbisPhase == 1) {
01021     doForce();
01022     if (doStreaming) {
01023       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01024       patchReadyQueueLen = tileListKernel.getNumPatches();
01025       patchReadyQueueNext = 0;
01026       // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
01027       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01028       int* emptyPatches = tileListKernel.getEmptyPatches();
01029       for (int i=0;i < numEmptyPatches;i++) {
01030         patchReadyQueue[i] = emptyPatches[i];
01031       }
01032       if (patchReadyQueueLen != patches.size())
01033         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01034     }
01035   }
01036 
01037   // For GBIS phase 1 at pairlist update, we must re-sort tile list
01038   // before calling doGBISphase1().
01039   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01040     // In this code path doGBISphase1() is called in forceDone()
01041     forceDoneSetCallback();
01042     return;
01043   }
01044 
01045   // GBIS Phases
01046   if (simParams->GBISOn) {
01047     if (gbisPhase == 1) {
01048       doGBISphase1();
01049     } else if (gbisPhase == 2) {
01050       doGBISphase2(); 
01051     } else if (gbisPhase == 3) {
01052       doGBISphase3(); 
01053     }
01054   }
01055 
01056   // Copy forces to host
01057   if (!simParams->GBISOn || gbisPhase == 3) {
01058     if (!doStreaming) {
01059       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01060       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01061     }
01062   }
01063 
01064   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01065     // For GBIS, energies are ready after phase 2
01066     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01067       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01068       d_forces, d_forcesSlow, d_virialEnergy, stream);
01069     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01070   }
01071 
01072   // Setup call back
01073   forceDoneSetCallback();
01074 }
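
The pair-list logic above follows the standard NAMD criterion: when lists are saved, they are built out to cutoff + 2 * maxPatchTolerance, and on later steps they remain usable while twice the maximum atom movement stays within that stored tolerance. A numeric illustration with invented values:

  // Invented numbers illustrating the pair-list test in launchWork().
  float cutoff = 12.0f;                                // short-range cutoff (A)
  float maxPatchTolerance = 1.5f;                      // from getMaxMovementTolerance()
  float pairlistTolerance = 2.0f * maxPatchTolerance;  // 3.0 A, stored at save time
  float plcutoff = cutoff + pairlistTolerance;         // 15.0 A list-build cutoff
  float plcutoff2 = plcutoff * plcutoff;               // squared, for distance tests
  // Lists stay valid while 2 * maxAtomMovement <= 3.0 A; otherwise a
  // REDUCTION_PAIRLIST_WARNINGS entry is recorded and the lists are not used.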

void CudaComputeNonbonded::messageEnqueueWork (  ) 

Definition at line 855 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

00855                                               {
00856   if (masterPe != CkMyPe())
00857     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00858   WorkDistrib::messageEnqueueWork(this);
00859 }

int CudaComputeNonbonded::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 880 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

00880                                  {
00881   // Simply enqueue doWork on masterPe and return "no work"
00882   computeMgr->sendMessageEnqueueWork(masterPe, this);
00883   return 1;
00884 }

void CudaComputeNonbonded::openBoxesOnPe (  ) 

Definition at line 861 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

00861                                          {
00862   if (rankPatches[CkMyRank()].size() == 0)
00863     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00864   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00865     openBox(rankPatches[CkMyRank()][i]);
00866   }
00867   bool done = false;
00868   CmiLock(lock);
00869   patchesCounter -= rankPatches[CkMyRank()].size();
00870   if (patchesCounter == 0) {
00871     patchesCounter = getNumPatches();
00872     done = true;
00873   }
00874   CmiUnlock(lock);
00875   if (done) {
00876     computeMgr->sendLaunchWork(masterPe, this);
00877   }
00878 }
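
openBoxesOnPe() and skipPatchesOnPe() share the same counter barrier: each rank subtracts its patch count under the lock, and whichever rank drives patchesCounter to zero re-arms it and notifies masterPe. The pattern, restated as a self-contained sketch with std::mutex standing in for the Charm++ lock:

  #include <mutex>

  // Counter barrier as used by openBoxesOnPe()/skipPatchesOnPe(); a sketch.
  class PatchCounterBarrier {
    std::mutex m;
    int counter;
    const int total;
  public:
    explicit PatchCounterBarrier(int numPatches)
      : counter(numPatches), total(numPatches) {}

    // Returns true exactly once per phase, for the last rank to arrive.
    bool arrive(int myPatchCount) {
      std::lock_guard<std::mutex> g(m);
      counter -= myPatchCount;
      if (counter == 0) {
        counter = total;  // re-arm for the next phase
        return true;      // caller then notifies masterPe
      }
      return false;
    }
  };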

void CudaComputeNonbonded::patchReady ( PatchID  pid,
int  doneMigration,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 232 of file CudaComputeNonbonded.C.

References NAMD_bug(), and Compute::patchReady().

00232                                                                              {
00233   if (doneMigration) {
00234     int i = findPid(pid);
00235     if (i == -1)
00236       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00237     updatePatch(i);
00238   }
00239   CmiLock(lock);
00240   Compute::patchReady(pid, doneMigration, seq);
00241   CmiUnlock(lock);
00242 }

void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID *  pid,
int *  trans 
)

Definition at line 173 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

00173                                                                                       {
00174   computesChanged = true;
00175   addPatch(pid[0]);
00176   addPatch(pid[1]);
00177   PatchMap* patchMap = PatchMap::Object();
00178   int t1 = trans[0];
00179   int t2 = trans[1];
00180   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
00181   offset.x += (t1%3-1) - (t2%3-1);
00182   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00183   offset.z += (t1/9-1) - (t2/9-1);
00184   addCompute(cid, pid[0], pid[1], offset);
00185 }
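
The offset arithmetic above decodes a periodic-image transform index t in [0, 26] into an image offset in {-1, 0, 1}^3, with t == 13 the identity image. A quick worked example:

  // Decoding transform index t = 14, as in registerComputePair():
  int t = 14;
  int ox = t % 3 - 1;        // 14 % 3 = 2       -> +1
  int oy = (t / 3) % 3 - 1;  // (14 / 3) % 3 = 1 ->  0
  int oz = t / 9 - 1;        // 14 / 9 = 1       ->  0   => image at (+1, 0, 0)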

void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 163 of file CudaComputeNonbonded.C.

00163                                                                          {
00164   computesChanged = true;
00165   addPatch(pid);
00166   addCompute(cid, pid, pid, 0.);
00167 }

void CudaComputeNonbonded::skipPatchesOnPe (  ) 

Definition at line 680 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

00680                                            {
00681   if (rankPatches[CkMyRank()].size() == 0)
00682     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00683   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00684     skipPatch(rankPatches[CkMyRank()][i]);
00685   }
00686   bool done = false;
00687   CmiLock(lock);
00688   patchesCounter -= rankPatches[CkMyRank()].size();
00689   if (patchesCounter == 0) {
00690     patchesCounter = getNumPatches();
00691     done = true;
00692   }
00693   CmiUnlock(lock);
00694   if (done) {
00695     // Reduction must be done on masterPe
00696     computeMgr->sendFinishReductions(masterPe, this);
00697   }
00698 }

void CudaComputeNonbonded::unregisterBoxesOnPe (  ) 

Definition at line 151 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

00151                                                {
00152   if (rankPatches[CkMyRank()].size() == 0)
00153     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00154   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00155     unregisterBox(rankPatches[CkMyRank()][i]);
00156   }
00157 }


The documentation for this class was generated from the following files:
CudaComputeNonbonded.h
CudaComputeNonbonded.C