CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded:

Inherits Compute and ComputeNonbondedUtil.

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbonded ()
void registerComputeSelf (ComputeID cid, PatchID pid)
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
void assignPatches (ComputeMgr *computeMgrIn)
virtual void initialize ()
virtual void atomUpdate ()
virtual int noWork ()
virtual void doWork ()
void launchWork ()
void finishReductions ()
void unregisterBoxesOnPe ()
void assignPatchesOnPe ()
void openBoxesOnPe ()
void skipPatchesOnPe ()
void finishPatchesOnPe ()
void finishPatchOnPe (int i)
void messageEnqueueWork ()
virtual void patchReady (PatchID, int doneMigration, int seq)
virtual void gbisP2PatchReady (PatchID, int seq)
virtual void gbisP3PatchReady (PatchID, int seq)

Classes

struct  ComputeRecord
struct  PatchRecord

Detailed Description

Definition at line 21 of file CudaComputeNonbonded.h.


Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 36 of file CudaComputeNonbonded.C.

References CUDA_DEBUG_EVENT, CUDA_GBIS1_KERNEL, CUDA_GBIS2_KERNEL, CUDA_GBIS3_KERNEL, CUDA_VDW_KERNEL, cudaCheck, Compute::gbisPhase, NAMD_die(), Node::Object(), Node::simParameters, and simParams.

00037                                                               : 
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084 
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 
00102 #define CUDA_DEBUG_EVENT 171
00103   traceRegisterUserEvent("CUDA DEBUG", CUDA_DEBUG_EVENT);
00104 #define CUDA_VDW_KERNEL 172
00105   traceRegisterUserEvent("CUDA VdW kernel", CUDA_VDW_KERNEL);
00106 #define CUDA_GBIS1_KERNEL 173
00107   traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL);
00108 #define CUDA_GBIS2_KERNEL 174
00109   traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL);
00110 #define CUDA_GBIS3_KERNEL 175
00111   traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL);
00112 }

CudaComputeNonbonded::~CudaComputeNonbonded (  ) 

Definition at line 117 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

00117                                             {
00118   cudaCheck(cudaSetDevice(deviceID));
00119   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00120   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00121   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00122   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00123   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00124   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00125   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00126   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00127 
00128   // GBIS
00129   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00130   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00131   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00132   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00133   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00134   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00135 
00136   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00137 
00138   if (patches.size() > 0) {
00139     deallocate_host<VirialEnergy>(&h_virialEnergy);
00140     deallocate_device<VirialEnergy>(&d_virialEnergy);
00141     cudaCheck(cudaStreamDestroy(stream));
00142     cudaCheck(cudaEventDestroy(forceDoneEvent));
00143     CmiDestroyLock(lock);
00144     delete reduction;
00145   }
00146 
00147   // NOTE: unregistering happens in the [sync] entry method
00148   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00149 
00150 }


Member Function Documentation

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn  ) 

Definition at line 374 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), j, NAMD_bug(), ComputePmeCUDAMgr::Object(), PatchMap::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), and Compute::setNumPatches().

Referenced by ComputeMgr::createComputes().

00374                                                                  {
00375   // Remove duplicate patches
00376   std::sort(patches.begin(), patches.end());
00377   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00378   patches.erase(last, patches.end());
00379   // Set number of patches
00380   setNumPatches(patches.size());
00381   masterPe = CkMyPe();
00382   computeMgr = computeMgrIn;
00383   // Start patch counter
00384   patchesCounter = getNumPatches();
00385   // Patch ID map
00386   std::map<PatchID, int> pidMap;
00387 #if 1
00388   //-------------------------------------------------------
00389   // Copied in from ComputeNonbondedCUDA::assignPatches()
00390   //-------------------------------------------------------
00391 
00392   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00393   int numPesOnNodeSharingDevice = 0;
00394   int masterIndex = -1;
00395   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00396     int pe = deviceCUDA->getPesSharingDevice(i);
00397     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00398     if ( CkNodeOf(pe) == CkMyNode() ) {
00399       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00400     }
00401   }
00402 
00403   std::vector<int> count(patches.size(), 0);
00404   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00405   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00406   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00407 
00408   PatchMap* patchMap = PatchMap::Object();
00409 
00410   int unassignedpatches = patches.size();
00411 
00412   for (int i=0;i < patches.size(); ++i) {
00413     patches[i].pe = -1;
00414   }
00415 
00416   // assign if home pe and build table of natural proxies
00417   for (int i=0;i < patches.size(); ++i) {
00418     int pid = patches[i].patchID;
00419     // homePe = PE where the patch currently resides
00420     int homePe = patchMap->node(pid);
00421     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00422       int pe = pesOnNodeSharingDevice[j];
00423       // If homePe is sharing this device, assign this patch to homePe
00424       if ( pe == homePe ) {
00425         patches[i].pe = pe;
00426         --unassignedpatches;
00427         pcount[j] += 1;
00428       }
00429       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00430         table[i*numPesOnNodeSharingDevice + j] = 1;
00431       }
00432     }
00433     // Assign this patch to homePe, if it resides on the same node
00434     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00435       patches[i].pe = homePe;
00436       --unassignedpatches;
00437       rankpcount[CkRankOf(homePe)] += 1;
00438     }
00439   }
00440   // assign if only one pe has a required proxy
00441   for (int i=0; i < patches.size(); ++i) {
00442     int pid = patches[i].patchID;
00443     if ( patches[i].pe != -1 ) continue;
00444     int c = 0;
00445     int lastj;
00446     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00447       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00448         ++c;
00449         lastj = j;
00450       }
00451     }
00452     count[i] = c;
00453     if ( c == 1 ) {
00454       patches[i].pe = pesOnNodeSharingDevice[lastj];
00455       --unassignedpatches;
00456       pcount[lastj] += 1;
00457     }
00458   }
00459   int assignj = 0;
00460   while ( unassignedpatches ) {
00461     int i;
00462     for (i=0;i < patches.size(); ++i) {
00463       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       patches[i].pe = pesOnNodeSharingDevice[assignj];
00468       --unassignedpatches;
00469       pcount[assignj] += 1;
00470       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00471       break;
00472     }
00473     if (i < patches.size() ) continue;  // start search again
00474     for ( i=0;i < patches.size(); ++i ) {
00475       int pid = patches[i].patchID;
00476       // patch_record &pr = patchRecords[pid];
00477       if ( patches[i].pe != -1 ) continue;
00478       if ( count[i] ) continue;
00479       patches[i].pe = pesOnNodeSharingDevice[assignj];
00480       --unassignedpatches;
00481       pcount[assignj] += 1;
00482       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00483       break;
00484     }
00485     if ( i < patches.size() ) continue;  // start search again
00486     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00487   }
00488 
00489   // For each rank, list of patches
00490   rankPatches.resize(CkMyNodeSize());
00491   for (int i=0; i < patches.size(); ++i) {
00492     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00493     pidMap[patches[i].patchID] = i;
00494   }
00495 
00496   // for ( int i=0; i < patches.size(); ++i ) {
00497   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00498   // }
00499 
00500 /*
00501   slavePes = new int[CkMyNodeSize()];
00502   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00503   numSlaves = 0;
00504   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00505     int pe = pesOnNodeSharingDevice[j];
00506     int rank = pe - CkNodeFirst(CkMyNode());
00507     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00509     if ( pe == CkMyPe() ) continue;
00510     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00511     rankpcount[rank] = 0;  // skip in rank loop below
00512     slavePes[numSlaves] = pe;
00513     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00514     ++numSlaves;
00515   }
00516   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00517     int pe = CkNodeFirst(CkMyNode()) + j;
00518     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00519     //          CkMyPe(),j,pe,rankpcount[j]);
00520     if ( ! rankpcount[j] ) continue;
00521     if ( pe == CkMyPe() ) continue;
00522     slavePes[numSlaves] = pe;
00523     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00524     ++numSlaves;
00525   }
00526 */
00527 
00528 #else
00529   // For each rank, list of patches
00530   rankPatches.resize(CkMyNodeSize());
00531   // For each rank, list of home patch IDs
00532   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00533   for (int i=0;i < CkMyNodeSize();i++) {
00534     int pe = CkNodeFirst(CkMyNode()) + i;
00535     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00536   }
00537   std::vector<int> proxyPatchPes;
00538   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00539   //--------------------------------------------------------
00540   // Build a list of PEs to avoid
00541   std::vector<int> pesToAvoid;
00542 #if 0
00543   // Avoid other GPUs' master PEs
00544   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00545     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00546     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00547   }
00548   // Avoid PEs that are involved in PME
00549   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00550   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00551     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00552   }
00553   // Set counters of avoidable PEs to high numbers
00554   for (int i=0;i < pesToAvoid.size();i++) {
00555     int pe = pesToAvoid[i];
00556     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);    
00557   }
00558 #endif
00559   // Avoid master Pe somewhat
00560   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00561   //--------------------------------------------------------
00562   for (int i=0;i < patches.size();i++) {
00563     PatchID pid = patches[i].patchID;
00564     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00565     if (pe == -1) {
00566       // Patch not present on this node => try finding a ProxyPatch
00567       findProxyPatchPes(proxyPatchPes, pid);
00568       if (proxyPatchPes.size() == 0) {
00569         // No ProxyPatch => create one on rank that has the least ProxyPatches
00570         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00571         pe = CkNodeFirst(CkMyNode()) + rank;
00572         peProxyPatchCounter[rank]++;
00573       } else {
00574         // Choose a ProxyPatch, trying to avoid masterPe (the current Pe) and
00575         // Pes that already have a ProxyPatch. This is done by finding the entry
00576         // with the minimum peProxyPatchCounter value, i.e., the minimum among
00577         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
00578         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00579         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00580         //   - proxyPatchPes.begin();
00581         // pe = proxyPatchPes[pppi];
00582         int minCounter = (1 << 30);
00583         for (int j=0;j < proxyPatchPes.size();j++) {
00584           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00585             pe = proxyPatchPes[j];
00586             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00587           }
00588         }
00589         if (pe == -1)
00590           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00591         peProxyPatchCounter[CkRankOf(pe)]++;
00592       }
00593     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00594       // Found the home patch on this node, but it's on a PE that should be avoided => find a new one
00595       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00596       pe = CkNodeFirst(CkMyNode()) + rank;
00597       peProxyPatchCounter[rank]++;
00598     }
00599     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00600       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00601     rankPatches[CkRankOf(pe)].push_back(i);
00602     pidMap[pid] = i;
00603   }
00604 
00605   delete [] rankHomePatchIDs;
00606 #endif
00607   // Setup computes using pidMap
00608   for (int i=0;i < computes.size();i++) {
00609     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00610     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00611   }
00612   for (int i=0;i < CkMyNodeSize();i++) {
00613     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00614   }
00615   computeMgr->sendAssignPatchesOnPe(pes, this);
00616 }
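
The assignment logic above runs in three passes: patches whose home PE shares the device go to that home PE; patches whose required proxy exists on exactly one device-sharing PE go there; everything left is distributed round-robin. The standalone sketch below restates those passes on made-up data (the container names and sample values are illustrative only, not NAMD structures):

#include <cstdio>
#include <vector>

int main() {
  // PEs on this node that share the device (illustrative values)
  std::vector<int> pes = {0, 1, 2};
  // homePe[i] = home PE of patch i (-1 if the home PE is elsewhere)
  std::vector<int> homePe = {1, -1, -1, -1};
  // hasProxy[i*pes.size()+j] != 0 if PE pes[j] holds a proxy of patch i
  std::vector<char> hasProxy = {0,1,0,  0,0,1,  0,0,0,  0,0,0};

  std::vector<int> assigned(homePe.size(), -1);
  // Pass 1: the home PE wins if it shares the device
  for (size_t i = 0; i < homePe.size(); i++)
    for (size_t j = 0; j < pes.size(); j++)
      if (pes[j] == homePe[i]) assigned[i] = pes[j];
  // Pass 2: a patch whose proxy lives on exactly one PE goes there
  for (size_t i = 0; i < homePe.size(); i++) {
    if (assigned[i] != -1) continue;
    int c = 0, lastj = -1;
    for (size_t j = 0; j < pes.size(); j++)
      if (hasProxy[i*pes.size() + j]) { c++; lastj = (int)j; }
    if (c == 1) assigned[i] = pes[lastj];
  }
  // Pass 3: round-robin whatever is left across the device's PEs
  int rr = 0;
  for (size_t i = 0; i < homePe.size(); i++)
    if (assigned[i] == -1) assigned[i] = pes[rr++ % pes.size()];

  for (size_t i = 0; i < homePe.size(); i++)
    printf("patch %zu -> pe %d\n", i, assigned[i]);
  return 0;
}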

void CudaComputeNonbonded::assignPatchesOnPe (  ) 

Definition at line 312 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), j, NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< Elem >::size(), and y.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

00312                                              {
00313   if (rankPatches[CkMyRank()].size() == 0)
00314     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00315 
00316   // calculate priority rank of local home patch within pe
00317   {
00318     PatchMap* patchMap = PatchMap::Object();
00319     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00320     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00321       int i = rankPatches[CkMyRank()][k];
00322       int pid = patches[i].patchID;
00323       int homePe = patchMap->node(pid);
00324       if ( CkNodeOf(homePe) == CkMyNode() ) {
00325         int2 pid_index;
00326         pid_index.x = pid;
00327         pid_index.y = i;
00328         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00329       }
00330     }
00331     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00332       pid_sortop_reverse_priority so;
00333       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00334       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00335       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00336         int index = homePatchByRank[i][j].y;
00337         patches[index].reversePriorityRankInPe = j + masterBoost;
00338       }
00339     }
00340   }
00341 
00342   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00343     assignPatch(rankPatches[CkMyRank()][i]);
00344   }
00345 }

void CudaComputeNonbonded::atomUpdate (  )  [virtual]

Reimplemented from Compute.

Definition at line 649 of file CudaComputeNonbonded.C.

00649                                       {
00650   atomsChangedIn = true;
00651 }

void CudaComputeNonbonded::doWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 925 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), Node::simParameters, and simParams.

00925                                   {
00926   if (CkMyPe() != masterPe)
00927     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00928 
00929   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00930   // atomsChangedIn can be set to true by any Pe
00931   // atomsChanged can only be set by masterPe
00932   // Using two separate variables makes sure we don't have a race condition
00933   atomsChanged = atomsChangedIn;
00934   atomsChangedIn = false;
00935 
00936   SimParameters *simParams = Node::Object()->simParameters;
00937 
00938   if (patches.size() == 0) return;  // No work to do
00939 
00940   // Take the flags from the first patch on this Pe
00941   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00942   Flags &flags = patches[0].patch->flags;
00943 
00944   doSlow = flags.doFullElectrostatics;
00945   doEnergy = flags.doEnergy;
00946   doVirial = flags.doVirial;
00947 
00948   if (flags.doNonbonded) {
00949 
00950     if (simParams->GBISOn) {
00951       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00952     }
00953 
00954     if (!simParams->GBISOn || gbisPhase == 1) {
00955       if ( computesChanged ) {
00956         updateComputes();
00957       }
00958       if (atomsChanged) {
00959         // Re-calculate patch atom numbers and storage
00960         updatePatches();
00961         reSortDone = false;
00962       }
00963       reallocateArrays();
00964     }
00965 
00966     // Open boxes on Pes and launch work to masterPe
00967     computeMgr->sendOpenBoxesOnPe(pes, this);
00968 
00969   } else {
00970     // No work to do, skip
00971     skip();
00972   }
00973 
00974 }
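
The atomsChangedIn/atomsChanged handoff at the top of doWork() is a simple producer/consumer flag pattern: any Pe may set the incoming flag, and only masterPe reads and resets it. A minimal sketch follows; the std::atomic and the trivial main() are stand-ins for the Charm++ machinery (the real code relies on the runtime's scheduling rather than atomics):

#include <atomic>
#include <cstdio>

// Any Pe may flag an atom update; only the master Pe consumes it.
std::atomic<bool> atomsChangedIn{false};
bool atomsChanged = false;   // written only by the master Pe

void atomUpdate() {          // may run on any Pe
  atomsChangedIn = true;
}

void doWorkOnMaster() {      // runs only on the master Pe
  // Read-and-reset: updates arriving later set the flag for the next step.
  atomsChanged = atomsChangedIn.exchange(false);
  if (atomsChanged)
    printf("re-count atoms, reallocate arrays, rebuild tile lists\n");
}

int main() {
  atomUpdate();
  doWorkOnMaster();  // sees the change exactly once
  doWorkOnMaster();  // nothing to do this time
  return 0;
}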

void CudaComputeNonbonded::finishPatchesOnPe (  ) 

Definition at line 1371 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

01371                                              {
01372   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01373 }

void CudaComputeNonbonded::finishPatchOnPe ( int  i  ) 

Definition at line 1378 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

01378                                                 {
01379   std::vector<int> v(1, i);
01380   finishSetOfPatchesOnPe(v);
01381 }

void CudaComputeNonbonded::finishReductions (  ) 

Definition at line 1212 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_VIRIAL_NBOND, REDUCTION_VIRIAL_SLOW, Node::simParameters, simParams, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

01212                                             {
01213 
01214   if (CkMyPe() != masterPe)
01215     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01216   
01217   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01218 
01219   if (!doSkip) {
01220 
01221     if (doStreaming && (doVirial || doEnergy)) {
01222       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01223       if (!forceDoneEventRecord)
01224         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01225       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01226       forceDoneEventRecord = false;
01227     }
01228 
01229     if (doVirial) {
01230       Tensor virialTensor;
01231       virialTensor.xx = h_virialEnergy->virial[0];
01232       virialTensor.xy = h_virialEnergy->virial[1];
01233       virialTensor.xz = h_virialEnergy->virial[2];
01234       virialTensor.yx = h_virialEnergy->virial[3];
01235       virialTensor.yy = h_virialEnergy->virial[4];
01236       virialTensor.yz = h_virialEnergy->virial[5];
01237       virialTensor.zx = h_virialEnergy->virial[6];
01238       virialTensor.zy = h_virialEnergy->virial[7];
01239       virialTensor.zz = h_virialEnergy->virial[8];
01240       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01241       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01242       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01243       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01244       if (doSlow) {
01245         Tensor virialTensor;
01246         virialTensor.xx = h_virialEnergy->virialSlow[0];
01247         virialTensor.xy = h_virialEnergy->virialSlow[1];
01248         virialTensor.xz = h_virialEnergy->virialSlow[2];
01249         virialTensor.yx = h_virialEnergy->virialSlow[3];
01250         virialTensor.yy = h_virialEnergy->virialSlow[4];
01251         virialTensor.yz = h_virialEnergy->virialSlow[5];
01252         virialTensor.zx = h_virialEnergy->virialSlow[6];
01253         virialTensor.zy = h_virialEnergy->virialSlow[7];
01254         virialTensor.zz = h_virialEnergy->virialSlow[8];
01255         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01256         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01257         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01258         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01259       }
01260     }
01261     if (doEnergy) {
01262       // if (doSlow)
01263       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01264       SimParameters *simParams = Node::Object()->simParameters;
01265       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01266       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01267       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01268       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01269       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01270     }
01271 
01272     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01273   }
01274   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01275   reduction->submit();
01276 
01277   // Reset flags
01278   doSkip = false;
01279   computesChanged = false;
01280 }
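
Both virial blocks above unpack a flat 9-element, row-major array into a Tensor. A factored-out sketch, with a minimal stand-in for NAMD's Tensor type:

#include <cstdio>

struct Tensor { double xx, xy, xz, yx, yy, yz, zx, zy, zz; };

// Row-major unpacking: v[0..2] is row x, v[3..5] row y, v[6..8] row z.
Tensor unpackVirial(const double v[9]) {
  Tensor t;
  t.xx = v[0]; t.xy = v[1]; t.xz = v[2];
  t.yx = v[3]; t.yy = v[4]; t.yz = v[5];
  t.zx = v[6]; t.zy = v[7]; t.zz = v[8];
  return t;
}

int main() {
  const double v[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  Tensor t = unpackVirial(v);
  printf("%g %g %g\n%g %g %g\n%g %g %g\n",
         t.xx, t.xy, t.xz, t.yx, t.yy, t.yz, t.zx, t.zy, t.zz);
  return 0;
}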

void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  ,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 255 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

00255                                                                 {
00256   CmiLock(lock);
00257   Compute::gbisP2PatchReady(pid, seq);
00258   CmiUnlock(lock);
00259 }

void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  ,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 261 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

00261                                                                 {
00262   CmiLock(lock);
00263   Compute::gbisP3PatchReady(pid, seq);
00264   CmiUnlock(lock);
00265 }

void CudaComputeNonbonded::initialize (  )  [virtual]

Reimplemented from Compute.

Definition at line 618 of file CudaComputeNonbonded.C.

References cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00618                                       {
00619   if (patches.size() > 0) {
00620     // Allocate CUDA version of patches
00621     cudaCheck(cudaSetDevice(deviceID));
00622     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00623 
00624     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00625     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00626 
00627 #if CUDA_VERSION >= 5050
00628     int leastPriority, greatestPriority;
00629     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00630     int priority = (doStreaming) ? leastPriority : greatestPriority;
00631     // int priority = greatestPriority;
00632     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00633 #else
00634     cudaCheck(cudaStreamCreate(&stream));
00635 #endif
00636     cudaCheck(cudaEventCreate(&forceDoneEvent));
00637 
00638     buildExclusions();
00639 
00640     lock = CmiCreateLock();
00641 
00642     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00643   }  
00644 }

void CudaComputeNonbonded::launchWork (  ) 

Definition at line 976 of file CudaComputeNonbonded.C.

References cudaCheck, ComputeNonbondedUtil::cutoff, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, simParams, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

00976                                       {
00977   if (CkMyPe() != masterPe)
00978     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00979 
00980   beforeForceCompute = CkWallTimer();
00981 
00982   cudaCheck(cudaSetDevice(deviceID));
00983   SimParameters *simParams = Node::Object()->simParameters;
00984 
00985   //execute only during GBIS phase 1, or if not using GBIS
00986   if (!simParams->GBISOn || gbisPhase == 1) {
00987 
00988     if ( atomsChanged || computesChanged ) {
00989       // Invalidate pair lists
00990       pairlistsValid = false;
00991       pairlistTolerance = 0.0f;
00992     }
00993 
00994     // Get maximum atom movement and patch tolerance
00995     float maxAtomMovement = 0.0f;
00996     float maxPatchTolerance = 0.0f;
00997     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00998     // Update pair-list cutoff
00999     Flags &flags = patches[0].patch->flags;
01000     savePairlists = false;
01001     usePairlists = false;
01002     if ( flags.savePairlists ) {
01003       savePairlists = true;
01004       usePairlists = true;
01005     } else if ( flags.usePairlists ) {
01006       if ( ! pairlistsValid ||
01007            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01008         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01009       } else {
01010         usePairlists = true;
01011       }
01012     }
01013     if ( ! usePairlists ) {
01014       pairlistsValid = false;
01015     }
01016     float plcutoff = cutoff;
01017     if ( savePairlists ) {
01018       pairlistsValid = true;
01019       pairlistTolerance = 2. * maxPatchTolerance;
01020       plcutoff += pairlistTolerance;
01021     }
01022     plcutoff2 = plcutoff * plcutoff;
01023 
01024     // if (atomsChanged)
01025     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01026     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01027 
01028   } // if (!simParams->GBISOn || gbisPhase == 1)
01029 
01030   // Calculate PME & VdW forces
01031   if (!simParams->GBISOn || gbisPhase == 1) {
01032     doForce();
01033     if (doStreaming) {
01034       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01035       patchReadyQueueLen = tileListKernel.getNumPatches();
01036       patchReadyQueueNext = 0;
01037       // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
01038       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01039       int* emptyPatches = tileListKernel.getEmptyPatches();
01040       for (int i=0;i < numEmptyPatches;i++) {
01041         patchReadyQueue[i] = emptyPatches[i];
01042       }
01043       if (patchReadyQueueLen != patches.size())
01044         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01045     }
01046   }
01047 
01048   // For GBIS phase 1 at pairlist update, we must re-sort the tile list
01049   // before calling doGBISphase1().
01050   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01051     // In this code path doGBISphase1() is called in forceDone()
01052     forceDoneSetCallback();
01053     return;
01054   }
01055 
01056   // GBIS Phases
01057   if (simParams->GBISOn) {
01058     if (gbisPhase == 1) {
01059       doGBISphase1();
01060     } else if (gbisPhase == 2) {
01061       doGBISphase2(); 
01062     } else if (gbisPhase == 3) {
01063       doGBISphase3(); 
01064     }
01065   }
01066 
01067   // Copy forces to host
01068   if (!simParams->GBISOn || gbisPhase == 3) {
01069     if (!doStreaming) {
01070       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01071       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01072     }
01073   }
01074 
01075   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01076     // For GBIS, energies are ready after phase 2
01077     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01078       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01079       d_forces, d_forcesSlow, d_virialEnergy, stream);
01080     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01081   }
01082 
01083   // Setup call back
01084   forceDoneSetCallback();
01085 }
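
The pairlist bookkeeping in the first half of launchWork() follows the usual Verlet-list rule: when lists are saved, the list cutoff is padded by twice the maximum patch tolerance, and a saved list remains usable only while twice the maximum atom movement stays within that padding. A condensed restatement (plain C++ with the flags passed in explicitly; the pairlist-warning reduction is omitted):

#include <cstdio>

struct PairlistState {
  bool  valid = false;
  float tolerance = 0.0f;   // set to 2 * maxPatchTolerance at save time
};

// Returns the squared list cutoff for this step and updates the state.
float pairlistCutoff2(PairlistState &s, float cutoff,
                      float maxAtomMovement, float maxPatchTolerance,
                      bool saveLists, bool useLists) {
  bool use = saveLists;
  if (!use && useLists && s.valid && 2.0f * maxAtomMovement <= s.tolerance)
    use = true;               // the saved list still covers the motion
  if (!use) s.valid = false;  // otherwise fall back to the plain cutoff
  float plcutoff = cutoff;
  if (saveLists) {
    s.valid = true;
    s.tolerance = 2.0f * maxPatchTolerance;
    plcutoff += s.tolerance;  // pad the cutoff by the tolerance margin
  }
  return plcutoff * plcutoff;
}

int main() {
  PairlistState s;
  // Save step: cutoff 12 padded by 2*0.5 -> plcutoff 13, squared 169.
  printf("%g\n", pairlistCutoff2(s, 12.0f, 0.0f, 0.5f, true, false));
  // Reuse step: atoms moved 0.4, and 2*0.4 <= 1.0, so the list stays valid.
  pairlistCutoff2(s, 12.0f, 0.4f, 0.5f, false, true);
  printf("valid after reuse check: %d\n", s.valid);
  return 0;
}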

void CudaComputeNonbonded::messageEnqueueWork (  ) 

Definition at line 866 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

00866                                               {
00867   if (masterPe != CkMyPe())
00868     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00869   WorkDistrib::messageEnqueueWork(this);
00870 }

int CudaComputeNonbonded::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 891 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

00891                                  {
00892   // Simply enqueue doWork on masterPe and return "no work"
00893   computeMgr->sendMessageEnqueueWork(masterPe, this);
00894   return 1;
00895 }
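
Note the inversion here: noWork() always reports "no work" to the local scheduler, while the message sent to masterPe is what eventually triggers doWork() there (via messageEnqueueWork()). The toy, single-threaded sketch below shows just that control flow; the queue and dispatcher are hypothetical stand-ins for ComputeMgr and the Charm++ scheduler:

#include <cstdio>
#include <functional>
#include <queue>

// Toy single-threaded stand-in for the masterPe message queue.
std::queue<std::function<void()>> masterPeQueue;

struct Compute {
  virtual int  noWork() { return 0; }  // 0 = scheduler enqueues doWork locally
  virtual void doWork() {}
  virtual ~Compute() {}
};

struct MasterDelegatingCompute : Compute {
  int noWork() override {
    masterPeQueue.push([this] { doWork(); });  // forward the real work
    return 1;  // report "no work" so the local enqueue is skipped
  }
  void doWork() override { printf("doWork() on masterPe\n"); }
};

int main() {
  MasterDelegatingCompute c;
  if (!c.noWork()) c.doWork();      // local path never taken
  while (!masterPeQueue.empty()) {  // masterPe drains its queue
    masterPeQueue.front()();
    masterPeQueue.pop();
  }
  return 0;
}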

void CudaComputeNonbonded::openBoxesOnPe (  ) 

Definition at line 872 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

00872                                          {
00873   if (rankPatches[CkMyRank()].size() == 0)
00874     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00875   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00876     openBox(rankPatches[CkMyRank()][i]);
00877   }
00878   bool done = false;
00879   CmiLock(lock);
00880   patchesCounter -= rankPatches[CkMyRank()].size();
00881   if (patchesCounter == 0) {
00882     patchesCounter = getNumPatches();
00883     done = true;
00884   }
00885   CmiUnlock(lock);
00886   if (done) {
00887     computeMgr->sendLaunchWork(masterPe, this);
00888   }
00889 }
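
openBoxesOnPe() and skipPatchesOnPe() share the same completion idiom: each rank subtracts its patch count from a node-wide counter under a lock, and the rank that drives the counter to zero re-arms it and notifies masterPe. A distilled sketch, with std::mutex standing in for CmiLock:

#include <cstdio>
#include <mutex>

struct PatchCounter {
  int        count;   // patches remaining this step
  const int  total;   // getNumPatches()
  std::mutex lock;    // stands in for the CmiLock
  explicit PatchCounter(int n) : count(n), total(n) {}

  // Returns true for exactly one caller per step: the rank that
  // completes the countdown and should notify masterPe.
  bool finish(int patchesOnThisRank) {
    std::lock_guard<std::mutex> guard(lock);
    count -= patchesOnThisRank;
    if (count == 0) { count = total; return true; }  // re-arm for next step
    return false;
  }
};

int main() {
  PatchCounter pc(8);           // 8 patches split over two ranks (5 + 3)
  printf("%d\n", pc.finish(5)); // 0: not done yet
  printf("%d\n", pc.finish(3)); // 1: this rank notifies masterPe
  return 0;
}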

void CudaComputeNonbonded::patchReady ( PatchID  ,
int  doneMigration,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 243 of file CudaComputeNonbonded.C.

References NAMD_bug(), and Compute::patchReady().

00243                                                                              {
00244   if (doneMigration) {
00245     int i = findPid(pid);
00246     if (i == -1)
00247       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00248     updatePatch(i);
00249   }
00250   CmiLock(lock);
00251   Compute::patchReady(pid, doneMigration, seq);
00252   CmiUnlock(lock);
00253 }

void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID *  pid,
int *  trans 
)

Definition at line 184 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

00184                                                                                       {
00185   computesChanged = true;
00186   addPatch(pid[0]);
00187   addPatch(pid[1]);
00188   PatchMap* patchMap = PatchMap::Object();
00189   int t1 = trans[0];
00190   int t2 = trans[1];
00191   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
00192   offset.x += (t1%3-1) - (t2%3-1);
00193   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00194   offset.z += (t1/9-1) - (t2/9-1);
00195   addCompute(cid, pid[0], pid[1], offset);
00196 }
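
The offset arithmetic above decodes each transform code t (0..26) into a lattice shift in {-1,0,1}^3 via x = t%3-1, y = (t/3)%3-1, z = t/9-1; the pair offset is then the difference of the two shifts added to the patch-center separation. A quick check of the decoding:

#include <cstdio>

int main() {
  for (int t = 0; t < 27; t++) {
    int x = t % 3 - 1;        // fastest-varying axis
    int y = (t / 3) % 3 - 1;
    int z = t / 9 - 1;        // slowest-varying axis
    printf("trans %2d -> (%2d, %2d, %2d)\n", t, x, y, z);
  }
  // trans 13 -> (0, 0, 0): the identity transform sits at the center.
  return 0;
}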

void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 174 of file CudaComputeNonbonded.C.

00174                                                                          {
00175   computesChanged = true;
00176   addPatch(pid);
00177   addCompute(cid, pid, pid, 0.);
00178 }

void CudaComputeNonbonded::skipPatchesOnPe (  ) 

Definition at line 691 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

00691                                            {
00692   if (rankPatches[CkMyRank()].size() == 0)
00693     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00694   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00695     skipPatch(rankPatches[CkMyRank()][i]);
00696   }
00697   bool done = false;
00698   CmiLock(lock);
00699   patchesCounter -= rankPatches[CkMyRank()].size();
00700   if (patchesCounter == 0) {
00701     patchesCounter = getNumPatches();
00702     done = true;
00703   }
00704   CmiUnlock(lock);
00705   if (done) {
00706     // Reduction must be done on masterPe
00707     computeMgr->sendFinishReductions(masterPe, this);
00708   }
00709 }

void CudaComputeNonbonded::unregisterBoxesOnPe (  ) 

Definition at line 162 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

00162                                                {
00163   if (rankPatches[CkMyRank()].size() == 0)
00164     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00165   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00166     unregisterBox(rankPatches[CkMyRank()][i]);
00167   }
00168 }


The documentation for this class was generated from the following files:

CudaComputeNonbonded.h
CudaComputeNonbonded.C