CudaComputeNonbonded.C

00001 #include <algorithm>
00002 #include <map>
00003 #include <vector>
00004 #include "NamdTypes.h"
00005 #include "charm++.h"
00006 #include "Patch.h"
00007 #include "PatchMap.h"
00008 #include "ProxyMgr.h"
00009 #include "LJTable.h"
00010 #include "Node.h"
00011 #include "ObjectArena.h"
00012 // #include "ComputeCUDAMgr.h"
00013 #include "ReductionMgr.h"
00014 #include "CudaComputeNonbonded.h"
00015 #include "WorkDistrib.h"
00016 #include "HomePatch.h"
00017 #include "Priorities.h"
00018 #include "ComputePmeCUDAMgr.h"
00019 //#include "CudaUtils.h"
00020 
00021 #include "DeviceCUDA.h"
00022 #ifdef NAMD_CUDA
00023 #ifdef WIN32
00024 #define __thread __declspec(thread)
00025 #endif
00026 extern __thread DeviceCUDA *deviceCUDA;
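      // deviceCUDA is declared __thread, i.e. thread-local: each Pe (worker thread)
      // holds its own pointer to the CUDA device manager.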
00027 #endif
00028 
00029 #ifdef NAMD_CUDA
00030 
00031 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++
00032 
00033 //
00034 // Class constructor
00035 //
00036 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
00037   CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) : 
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084 
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 
00102 #define CUDA_DEBUG_EVENT 171
00103   traceRegisterUserEvent("CUDA DEBUG", CUDA_DEBUG_EVENT);
00104 #define CUDA_VDW_KERNEL 172
00105   traceRegisterUserEvent("CUDA VdW kernel", CUDA_VDW_KERNEL);
00106 #define CUDA_GBIS1_KERNEL 173
00107   traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL);
00108 #define CUDA_GBIS2_KERNEL 174
00109   traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL);
00110 #define CUDA_GBIS3_KERNEL 175
00111   traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL);
00112 }
00113 
00114 //
00115 // Class destructor
00116 //
00117 CudaComputeNonbonded::~CudaComputeNonbonded() {
00118   cudaCheck(cudaSetDevice(deviceID));
00119   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00120   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00121   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00122   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00123   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00124   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00125   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00126   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00127 
00128   // GBIS
00129   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00130   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00131   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00132   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00133   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00134   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00135 
00136   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00137 
00138   if (patches.size() > 0) {
00139     deallocate_host<VirialEnergy>(&h_virialEnergy);
00140     deallocate_device<VirialEnergy>(&d_virialEnergy);
00141     cudaCheck(cudaStreamDestroy(stream));
00142     cudaCheck(cudaEventDestroy(forceDoneEvent));
00143     CmiDestroyLock(lock);
00144     delete reduction;
00145   }
00146 
00147   // NOTE: unregistering happens in the [sync] entry method
00148   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00149 
00150 }
00151 
00152 void CudaComputeNonbonded::unregisterBox(int i) {
00153   if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
00154   if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
00155   if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
00156   if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
00157   if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
00158   if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
00159   if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
00160 }
00161 
00162 void CudaComputeNonbonded::unregisterBoxesOnPe() {
00163   if (rankPatches[CkMyRank()].size() == 0)
00164     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00165   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00166     unregisterBox(rankPatches[CkMyRank()][i]);
00167   }
00168 }
00169 
00170 //
00171 // Register intra-patch (self) compute.
00172 // Only serialized calls allowed
00173 //
00174 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
00175   computesChanged = true;
00176   addPatch(pid);
00177   addCompute(cid, pid, pid, 0.);
00178 }
00179 
00180 //
00181 // Register pair-patch compute.
00182 // Only serialized calls allowed
00183 //
00184 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
00185   computesChanged = true;
00186   addPatch(pid[0]);
00187   addPatch(pid[1]);
00188   PatchMap* patchMap = PatchMap::Object();
00189   int t1 = trans[0];
00190   int t2 = trans[1];
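        // trans[] encodes one of the 27 periodic image translations in base 3:
        // (t%3-1, (t/3)%3-1, t/9-1) are the x, y, z image shifts in {-1, 0, 1}.
        // The compute offset is the patch center difference plus this relative shift.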
00191   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
00192   offset.x += (t1%3-1) - (t2%3-1);
00193   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00194   offset.z += (t1/9-1) - (t2/9-1);
00195   addCompute(cid, pid[0], pid[1], offset);
00196 }
00197 
00198 //
00199 // Add patch
00200 //
00201 void CudaComputeNonbonded::addPatch(PatchID pid) {
00202   patches.push_back(PatchRecord(pid));
00203 }
00204 
00205 //
00206 // Add compute
00207 //
00208 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
00209   ComputeRecord cr;
00210   cr.cid = cid;
00211   cr.pid[0] = pid1;
00212   cr.pid[1] = pid2;
00213   cr.offset = offset;
00214   computes.push_back(cr);
00215 }
00216 
00217 //
00218 // Update numAtoms and numFreeAtoms on a patch
00219 //
00220 void CudaComputeNonbonded::updatePatch(int i) {
00221   int numAtoms = patches[i].patch->getNumAtoms();
00222   int numFreeAtoms = numAtoms;
00223   if ( fixedAtomsOn ) {
00224     const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
00225     for ( int j=0; j< numAtoms; ++j ) {
00226       if ( aExt[j].atomFixed ) --numFreeAtoms;
00227     }
00228   }
00229   patches[i].numAtoms = numAtoms;
00230   patches[i].numFreeAtoms = numFreeAtoms;
00231   cudaPatches[i].numAtoms = numAtoms;
00232   cudaPatches[i].numFreeAtoms = numFreeAtoms;
00233 }
00234 
00235 int CudaComputeNonbonded::findPid(PatchID pid) {
00236   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00237     int j = rankPatches[CkMyRank()][i];
00238     if (patches[j].patchID == pid) return j;
00239   }
00240   return -1;
00241 }
00242 
00243 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
00244   if (doneMigration) {
00245     int i = findPid(pid);
00246     if (i == -1)
00247       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00248     updatePatch(i);
00249   }
00250   CmiLock(lock);
00251   Compute::patchReady(pid, doneMigration, seq);
00252   CmiUnlock(lock);
00253 }
00254 
00255 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
00256   CmiLock(lock);
00257   Compute::gbisP2PatchReady(pid, seq);
00258   CmiUnlock(lock);
00259 }
00260 
00261 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
00262   CmiLock(lock);
00263   Compute::gbisP3PatchReady(pid, seq);
00264   CmiUnlock(lock);
00265 }
00266 
00267 void CudaComputeNonbonded::assignPatch(int i) {
00268   PatchMap* patchMap = PatchMap::Object();
00269   PatchID pid = patches[i].patchID;
00270   Patch* patch = patchMap->patch(pid);
00271   if (patch == NULL) {
00272     // Create ProxyPatch if none exists
00273     ProxyMgr::Object()->createProxy(pid);
00274     patch = patchMap->patch(pid);
00275   }
00276   patches[i].patch = patch;
00277   if (patches[i].patch == NULL) {
00278     NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
00279   }
00280   patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
00281   patches[i].forceBox    = patches[i].patch->registerForceDeposit(this);
00282   SimParameters *simParams = Node::Object()->simParameters;
00283   if (simParams->GBISOn) {
00284     patches[i].intRadBox     = patches[i].patch->registerIntRadPickup(this);
00285     patches[i].psiSumBox     = patches[i].patch->registerPsiSumDeposit(this);
00286     patches[i].bornRadBox    = patches[i].patch->registerBornRadPickup(this);
00287     patches[i].dEdaSumBox    = patches[i].patch->registerDEdaSumDeposit(this);
00288     patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
00289   }
00290   // Store Pe where this patch was registered
00291 #if 1
00292   if (patches[i].pe != CkMyPe()) {
00293     NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
00294   }
00295 #else
00296   patches[i].pe = CkMyPe();
00297 #endif
00298   //
00299   patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00300   patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00301 }
00302 
00303 struct pid_sortop_reverse_priority {
00304   bool operator() (int2 pidj, int2 pidi) {  // i and j reversed
00305     int ppi = PATCH_PRIORITY(pidi.x);
00306     int ppj = PATCH_PRIORITY(pidj.x);
00307     if ( ppi != ppj ) return ppi < ppj;
00308     return pidi.x < pidj.x;
00309   }
00310 };
00311 
00312 void CudaComputeNonbonded::assignPatchesOnPe() {
00313   if (rankPatches[CkMyRank()].size() == 0)
00314     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00315 
00316   // calculate priority rank of local home patch within pe
00317   {
00318     PatchMap* patchMap = PatchMap::Object();
00319     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00320     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00321       int i = rankPatches[CkMyRank()][k];
00322       int pid = patches[i].patchID;
00323       int homePe = patchMap->node(pid);
00324       if ( CkNodeOf(homePe) == CkMyNode() ) {
00325         int2 pid_index;
00326         pid_index.x = pid;
00327         pid_index.y = i;
00328         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00329       }
00330     }
00331     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00332       pid_sortop_reverse_priority so;
00333       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00334       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00335       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00336         int index = homePatchByRank[i][j].y;
00337         patches[index].reversePriorityRankInPe = j + masterBoost;
00338       }
00339     }
00340   }
00341 
00342   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00343     assignPatch(rankPatches[CkMyRank()][i]);
00344   }
00345 }
00346 
00347 //
00348 // Returns the Pe on this node that holds home patch "pid", or -1 if none does
00349 //
00350 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
00351 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
00352   // for (int i=0;i < rankPatchIDs.size();i++) {
00353   for (int i=0;i < CkMyNodeSize();i++) {
00354     if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
00355   }
00356   return -1;
00357 }
00358 
00359 //
00360 // Find all PEs on this node that have patch "pid"
00361 //
00362 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
00363   proxyPatchPes.clear();
00364   for (int i=0;i < CkMyNodeSize();i++) {
00365     int pe = CkNodeFirst(CkMyNode()) + i;
00366     if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
00367       proxyPatchPes.push_back(pe);
00368   }
00369 }
00370 
00371 //
00372 // Called after all computes have been registered
00373 //
00374 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
00375   // Remove duplicate patches
00376   std::sort(patches.begin(), patches.end());
00377   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00378   patches.erase(last, patches.end());
00379   // Set number of patches
00380   setNumPatches(patches.size());
00381   masterPe = CkMyPe();
00382   computeMgr = computeMgrIn;
00383   // Start patch counter
00384   patchesCounter = getNumPatches();
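        // patchesCounter is decremented by each rank as it finishes opening, skipping,
        // or closing its patches; the rank that brings it to zero notifies masterPe.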
00385   // Patch ID map
00386   std::map<PatchID, int> pidMap;
00387 #if 1
00388   //-------------------------------------------------------
00389   // Copied in from ComputeNonbondedCUDA::assignPatches()
00390   //-------------------------------------------------------
00391 
00392   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00393   int numPesOnNodeSharingDevice = 0;
00394   int masterIndex = -1;
00395   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00396     int pe = deviceCUDA->getPesSharingDevice(i);
00397     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00398     if ( CkNodeOf(pe) == CkMyNode() ) {
00399       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00400     }
00401   }
00402 
00403   std::vector<int> count(patches.size(), 0);
00404   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00405   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00406   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00407 
00408   PatchMap* patchMap = PatchMap::Object();
00409 
00410   int unassignedpatches = patches.size();
00411 
00412   for (int i=0;i < patches.size(); ++i) {
00413     patches[i].pe = -1;
00414   }
00415 
00416   // assign if home pe and build table of natural proxies
00417   for (int i=0;i < patches.size(); ++i) {
00418     int pid = patches[i].patchID;
00419     // homePe = PE where the patch currently resides
00420     int homePe = patchMap->node(pid);
00421     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00422       int pe = pesOnNodeSharingDevice[j];
00423       // If homePe is sharing this device, assign this patch to homePe
00424       if ( pe == homePe ) {
00425         patches[i].pe = pe;
00426         --unassignedpatches;
00427         pcount[j] += 1;
00428       }
00429       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00430         table[i*numPesOnNodeSharingDevice + j] = 1;
00431       }
00432     }
00433     // Assign this patch to homePe, if it resides on the same node
00434     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00435       patches[i].pe = homePe;
00436       --unassignedpatches;
00437       rankpcount[CkRankOf(homePe)] += 1;
00438     }
00439   }
00440   // assign if only one pe has a required proxy
00441   for (int i=0; i < patches.size(); ++i) {
00442     int pid = patches[i].patchID;
00443     if ( patches[i].pe != -1 ) continue;
00444     int c = 0;
00445     int lastj;
00446     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00447       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00448         ++c;
00449         lastj = j;
00450       }
00451     }
00452     count[i] = c;
00453     if ( c == 1 ) {
00454       patches[i].pe = pesOnNodeSharingDevice[lastj];
00455       --unassignedpatches;
00456       pcount[lastj] += 1;
00457     }
00458   }
00459   int assignj = 0;
00460   while ( unassignedpatches ) {
00461     int i;
00462     for (i=0;i < patches.size(); ++i) {
00463       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       patches[i].pe = pesOnNodeSharingDevice[assignj];
00468       --unassignedpatches;
00469       pcount[assignj] += 1;
00470       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00471       break;
00472     }
00473     if (i < patches.size() ) continue;  // start search again
00474     for ( i=0;i < patches.size(); ++i ) {
00475       int pid = patches[i].patchID;
00476       // patch_record &pr = patchRecords[pid];
00477       if ( patches[i].pe != -1 ) continue;
00478       if ( count[i] ) continue;
00479       patches[i].pe = pesOnNodeSharingDevice[assignj];
00480       --unassignedpatches;
00481       pcount[assignj] += 1;
00482       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00483       break;
00484     }
00485     if ( i < patches.size() ) continue;  // start search again
00486     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00487   }
00488 
00489   // For each rank, list of patches
00490   rankPatches.resize(CkMyNodeSize());
00491   for (int i=0; i < patches.size(); ++i) {
00492     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00493     pidMap[patches[i].patchID] = i;
00494   }
00495 
00496   // for ( int i=0; i < patches.size(); ++i ) {
00497   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00498   // }
00499 
00500 /*
00501   slavePes = new int[CkMyNodeSize()];
00502   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00503   numSlaves = 0;
00504   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00505     int pe = pesOnNodeSharingDevice[j];
00506     int rank = pe - CkNodeFirst(CkMyNode());
00507     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00509     if ( pe == CkMyPe() ) continue;
00510     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00511     rankpcount[rank] = 0;  // skip in rank loop below
00512     slavePes[numSlaves] = pe;
00513     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00514     ++numSlaves;
00515   }
00516   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00517     int pe = CkNodeFirst(CkMyNode()) + j;
00518     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00519     //          CkMyPe(),j,pe,rankpcount[j]);
00520     if ( ! rankpcount[j] ) continue;
00521     if ( pe == CkMyPe() ) continue;
00522     slavePes[numSlaves] = pe;
00523     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00524     ++numSlaves;
00525   }
00526 */
00527 
00528 #else
00529   // For each rank, list of patches
00530   rankPatches.resize(CkMyNodeSize());
00531   // For each rank, list of home patch IDs
00532   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00533   for (int i=0;i < CkMyNodeSize();i++) {
00534     int pe = CkNodeFirst(CkMyNode()) + i;
00535     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00536   }
00537   std::vector<int> proxyPatchPes;
00538   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00539   //--------------------------------------------------------
00540   // Build a list of PEs to avoid
00541   std::vector<int> pesToAvoid;
00542 #if 0
00543   // Avoid other GPUs' master PEs
00544   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00545     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00546     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00547   }
00548   // Avoid PEs that are involved in PME
00549   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00550   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00551     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00552   }
00553   // Set counters of avoidable PEs to high numbers
00554   for (int i=0;i < pesToAvoid.size();i++) {
00555     int pe = pesToAvoid[i];
00556     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);    
00557   }
00558 #endif
00559   // Avoid master Pe somewhat
00560   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00561   //--------------------------------------------------------
00562   for (int i=0;i < patches.size();i++) {
00563     PatchID pid = patches[i].patchID;
00564     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00565     if (pe == -1) {
00566       // Patch not present on this node => try finding a ProxyPatch
00567       findProxyPatchPes(proxyPatchPes, pid);
00568       if (proxyPatchPes.size() == 0) {
00569         // No ProxyPatch => create one on rank that has the least ProxyPatches
00570         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00571         pe = CkNodeFirst(CkMyNode()) + rank;
00572         peProxyPatchCounter[rank]++;
00573       } else {
00574         // Choose a ProxyPatch; try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch.
00575         // This is done by picking the entry with the minimum peProxyPatchCounter value,
00576         // i.e., the minimum among
00577         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
00578         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00579         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00580         //   - proxyPatchPes.begin();
00581         // pe = proxyPatchPes[pppi];
00582         int minCounter = (1 << 30);
00583         for (int j=0;j < proxyPatchPes.size();j++) {
00584           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00585             pe = proxyPatchPes[j];
00586             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00587           }
00588         }
00589         if (pe == -1)
00590           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00591         peProxyPatchCounter[CkRankOf(pe)]++;
00592       }
00593     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00594       // Found the home patch on this node, but it is on a PE that should be avoided => find a new one
00595       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00596       pe = CkNodeFirst(CkMyNode()) + rank;
00597       peProxyPatchCounter[rank]++;
00598     }
00599     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00600       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00601     rankPatches[CkRankOf(pe)].push_back(i);
00602     pidMap[pid] = i;
00603   }
00604 
00605   delete [] rankHomePatchIDs;
00606 #endif
00607   // Setup computes using pidMap
00608   for (int i=0;i < computes.size();i++) {
00609     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00610     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00611   }
00612   for (int i=0;i < CkMyNodeSize();i++) {
00613     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00614   }
00615   computeMgr->sendAssignPatchesOnPe(pes, this);
00616 }
00617 
00618 void CudaComputeNonbonded::initialize() {
00619   if (patches.size() > 0) {
00620     // Allocate CUDA version of patches
00621     cudaCheck(cudaSetDevice(deviceID));
00622     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00623 
00624     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00625     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00626 
00627 #if CUDA_VERSION >= 5050
00628     int leastPriority, greatestPriority;
00629     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00630     int priority = (doStreaming) ? leastPriority : greatestPriority;
00631     // int priority = greatestPriority;
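          // When per-patch streaming is enabled the nonbonded stream gets the lowest
          // priority, otherwise the highest (presumably so the single force kernel is
          // not delayed by other streams).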
00632     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00633 #else
00634     cudaCheck(cudaStreamCreate(&stream));
00635 #endif
00636     cudaCheck(cudaEventCreate(&forceDoneEvent));
00637 
00638     buildExclusions();
00639 
00640     lock = CmiCreateLock();
00641 
00642     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00643   }  
00644 }
00645 
00646 //
00647 // atomUpdate() can be called by any Pe
00648 //
00649 void CudaComputeNonbonded::atomUpdate() {
00650   atomsChangedIn = true;
00651 }
00652 
00653 //
00654 // Compute patches[].atomStart, maxTileListLen, and atomStorageSize
00655 //
00656 void CudaComputeNonbonded::updatePatches() {
00657 
00658   // Maximum number of tiles per tile list
00659   maxTileListLen = 0;
00660   int atomStart = 0;
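        // Each patch's atom range is padded to a multiple of WARPSIZE so every patch
        // begins on a tile boundary; atomStorageSize is the total padded atom count.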
00661   for (int i=0;i < patches.size();i++) {
00662     patches[i].atomStart = atomStart;
00663     cudaPatches[i].atomStart = atomStart;
00664     int numAtoms = patches[i].numAtoms;
00665     int numTiles = ((numAtoms-1)/WARPSIZE+1);
00666     maxTileListLen = std::max(maxTileListLen, numTiles);
00667     atomStart += numTiles*WARPSIZE;
00668   }
00669   atomStorageSize = atomStart;
00670 
00671   if (maxTileListLen >= 65536) {
00672     NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile list (65536) exceeded");
00673   }
00674 }
00675 
00676 void CudaComputeNonbonded::skipPatch(int i) {
00677   if (CkMyPe() != patches[i].pe)
00678     NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
00679   Flags &flags = patches[i].patch->flags;
00680   patches[i].positionBox->skip();
00681   patches[i].forceBox->skip();
00682   if (flags.doGBIS) {
00683     patches[i].psiSumBox->skip();
00684     patches[i].intRadBox->skip();
00685     patches[i].bornRadBox->skip();
00686     patches[i].dEdaSumBox->skip();
00687     patches[i].dHdrPrefixBox->skip();
00688   }
00689 }
00690 
00691 void CudaComputeNonbonded::skipPatchesOnPe() {
00692   if (rankPatches[CkMyRank()].size() == 0)
00693     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00694   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00695     skipPatch(rankPatches[CkMyRank()][i]);
00696   }
00697   bool done = false;
00698   CmiLock(lock);
00699   patchesCounter -= rankPatches[CkMyRank()].size();
00700   if (patchesCounter == 0) {
00701     patchesCounter = getNumPatches();
00702     done = true;
00703   }
00704   CmiUnlock(lock);
00705   if (done) {
00706     // Reduction must be done on masterPe
00707     computeMgr->sendFinishReductions(masterPe, this);
00708   }
00709 }
00710 
00711 void CudaComputeNonbonded::skip() {
00712   if (CkMyPe() != masterPe)
00713     NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
00714 
00715   if (patches.size() == 0) return;
00716 
00717   doSkip = true;
00718 
00719   computeMgr->sendSkipPatchesOnPe(pes, this);
00720 }
00721 
00722 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
00723   if (CkMyPe() != masterPe)
00724     NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
00725 
00726   for (int i=0;i < patches.size();++i) {
00727     PatchRecord &pr = patches[i];
00728 
00729     float maxMove = pr.patch->flags.maxAtomMovement;
00730     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
00731 
00732     float maxTol = pr.patch->flags.pairlistTolerance;
00733     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
00734   }
00735 }
00736 
00737 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
00738   CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
00739   c->updateVdwTypesExclSubset(first, last);
00740 }
00741 
00742 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
00743   for (int i=first;i <= last;i++) {
00744     PatchRecord &pr = patches[i];
00745     int start = pr.atomStart;
00746     int numAtoms = pr.numAtoms;
00747     const CompAtom *compAtom = pr.compAtom;
00748     const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
00749     // Atoms have changed, re-do exclusions and vdw types
00750     int2* exclp = exclIndexMaxDiff + start;
00751     int* aip = atomIndex + start;
00752     for ( int k=0;k < numAtoms; ++k ) {
00753       int j = compAtomExt[k].sortOrder;
00754       vdwTypes[start + k] = compAtom[j].vdwType;
00755       aip[k] = compAtomExt[j].id;
00756 #ifdef MEM_OPT_VERSION
00757       exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
00758       exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
00759 #else // ! MEM_OPT_VERSION
00760       exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
00761       exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
00762 #endif // MEM_OPT_VERSION
00763     }
00764   }
00765 }
00766 
00767 //
00768 // Called every time atoms changed
00769 //
00770 void CudaComputeNonbonded::updateVdwTypesExcl() {
00771   // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
00772   reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
00773   reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
00774   reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
00775 
00776 #if CMK_SMP && USE_CKLOOP
00777   int useCkLoop = Node::Object()->simParameters->useCkLoop;
00778   if (useCkLoop >= 1) {
00779     CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
00780   } else
00781 #endif
00782   {
00783     updateVdwTypesExclSubset(0, patches.size()-1);
00784   }
00785 
00786   nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
00787 }
00788 
00789 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
00790   CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
00791   c->copyAtomsSubset(first, last);
00792 }
00793 
00794 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
00795   for (int i=first;i <= last;++i) {
00796     PatchRecord &pr = patches[i];
00797     int numAtoms = pr.numAtoms;
00798     if (numAtoms > 0) {
00799       int start = pr.atomStart;
00800       const CudaAtom *src = pr.patch->getCudaAtomList();
00801       CudaAtom *dst = atoms + start;
00802       memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
00803       // Fill the rest with copies of the last atom
00804       int numAtomsAlign = ((numAtoms-1)/32+1)*32;
00805       CudaAtom lastAtom = src[numAtoms-1];
00806       for (int j=numAtoms;j < numAtomsAlign;j++) {
00807         dst[j] = lastAtom;
00808       }
00809     }
00810   }
00811 }
00812 
00813 void CudaComputeNonbonded::copyGBISphase(int i) {
00814   if (CkMyPe() != patches[i].pe)
00815     NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
00816   PatchRecord &pr = patches[i];
00817   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
00818   if (gbisPhase == 1) {
00819     //Copy GBIS intRadius to Host
00820     if (atomsChanged) {
00821       float *intRad0 = intRad0H + pr.atomStart;
00822       float *intRadS = intRadSH + pr.atomStart;
00823       for (int k=0;k < pr.numAtoms;++k) {
00824         int j = aExt[k].sortOrder;
00825         intRad0[k] = pr.intRad[2*j+0];
00826         intRadS[k] = pr.intRad[2*j+1];
00827       }
00828     }
00829   } else if (gbisPhase == 2) {
00830     float *bornRad = bornRadH + pr.atomStart;
00831     for ( int k=0; k < pr.numAtoms; ++k ) {
00832       int j = aExt[k].sortOrder;
00833       bornRad[k] = pr.bornRad[j];
00834     }
00835   } else if (gbisPhase == 3) {
00836     float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
00837     for ( int k=0; k < pr.numAtoms; ++k ) {
00838       int j = aExt[k].sortOrder;
00839       dHdrPrefix[k] = pr.dHdrPrefix[j];
00840     }
00841   } // end phases
00842 }
00843 
00844 void CudaComputeNonbonded::openBox(int i) {
00845   if (CkMyPe() != patches[i].pe)
00846     NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
00847   SimParameters *simParams = Node::Object()->simParameters;
00848   if (!simParams->GBISOn || gbisPhase == 1) {
00849     patches[i].compAtom = patches[i].positionBox->open();
00850     copyAtomsSubset(i, i);
00851   }
00852   if (simParams->GBISOn) {
00853     if (gbisPhase == 1) {
00854       patches[i].intRad     = patches[i].intRadBox->open();
00855       patches[i].psiSum     = patches[i].psiSumBox->open();
00856     } else if (gbisPhase == 2) {
00857       patches[i].bornRad    = patches[i].bornRadBox->open();
00858       patches[i].dEdaSum    = patches[i].dEdaSumBox->open();
00859     } else if (gbisPhase == 3) {
00860       patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
00861     }
00862     copyGBISphase(i);
00863   }
00864 }
00865 
00866 void CudaComputeNonbonded::messageEnqueueWork() {
00867   if (masterPe != CkMyPe())
00868     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00869   WorkDistrib::messageEnqueueWork(this);
00870 }
00871 
00872 void CudaComputeNonbonded::openBoxesOnPe() {
00873   if (rankPatches[CkMyRank()].size() == 0)
00874     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00875   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00876     openBox(rankPatches[CkMyRank()][i]);
00877   }
00878   bool done = false;
00879   CmiLock(lock);
00880   patchesCounter -= rankPatches[CkMyRank()].size();
00881   if (patchesCounter == 0) {
00882     patchesCounter = getNumPatches();
00883     done = true;
00884   }
00885   CmiUnlock(lock);
00886   if (done) {
00887     computeMgr->sendLaunchWork(masterPe, this);
00888   }
00889 }
00890 
00891 int CudaComputeNonbonded::noWork() {
00892   // Simply enqueue doWork on masterPe and return "no work"
00893   computeMgr->sendMessageEnqueueWork(masterPe, this);
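        // Returning 1 ("no work") keeps the scheduler from running this compute
        // locally; the actual work is launched from masterPe via doWork().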
00894   return 1;
00895 }
00896 
00897 void CudaComputeNonbonded::reallocateArrays() {
00898   cudaCheck(cudaSetDevice(deviceID));
00899   SimParameters *simParams = Node::Object()->simParameters;
00900 
00901   // Re-allocate atoms
00902   reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
00903 
00904   // Re-allocate forces
00905   if (doStreaming) {
00906     reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
00907     reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
00908   } else {
00909     reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
00910     reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
00911   }
00912   reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
00913   reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);  
00914 
00915   if (simParams->GBISOn) {
00916     reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
00917     reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
00918     reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
00919     reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
00920     reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
00921     reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
00922   }
00923 }
00924 
00925 void CudaComputeNonbonded::doWork() {
00926   if (CkMyPe() != masterPe)
00927     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00928 
00929   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00930   // atomsChangedIn can be set to true by any Pe
00931   // atomsChanged can only be set by masterPe
00932   // Using two separate variables ensures we don't have a race condition
00933   atomsChanged = atomsChangedIn;
00934   atomsChangedIn = false;
00935 
00936   SimParameters *simParams = Node::Object()->simParameters;
00937 
00938   if (patches.size() == 0) return;  // No work to do
00939 
00940   // Take the flags from the first patch on this Pe
00941   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00942   Flags &flags = patches[0].patch->flags;
00943 
00944   doSlow = flags.doFullElectrostatics;
00945   doEnergy = flags.doEnergy;
00946   doVirial = flags.doVirial;
00947 
00948   if (flags.doNonbonded) {
00949 
00950     if (simParams->GBISOn) {
00951       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00952     }
00953 
00954     if (!simParams->GBISOn || gbisPhase == 1) {
00955       if ( computesChanged ) {
00956         updateComputes();
00957       }
00958       if (atomsChanged) {
00959         // Re-calculate patch atom numbers and storage
00960         updatePatches();
00961         reSortDone = false;
00962       }
00963       reallocateArrays();
00964     }
00965 
00966     // Open boxes on Pes and launch work to masterPe
00967     computeMgr->sendOpenBoxesOnPe(pes, this);
00968 
00969   } else {
00970     // No work to do, skip
00971     skip();
00972   }
00973 
00974 }
00975 
00976 void CudaComputeNonbonded::launchWork() {
00977   if (CkMyPe() != masterPe)
00978     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00979 
00980   beforeForceCompute = CkWallTimer();
00981 
00982   cudaCheck(cudaSetDevice(deviceID));
00983   SimParameters *simParams = Node::Object()->simParameters;
00984 
00985   //execute only during GBIS phase 1, or if not using GBIS
00986   if (!simParams->GBISOn || gbisPhase == 1) {
00987 
00988     if ( atomsChanged || computesChanged ) {
00989       // Invalidate pair lists
00990       pairlistsValid = false;
00991       pairlistTolerance = 0.0f;
00992     }
00993 
00994     // Get maximum atom movement and patch tolerance
00995     float maxAtomMovement = 0.0f;
00996     float maxPatchTolerance = 0.0f;
00997     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00998     // Update pair-list cutoff
00999     Flags &flags = patches[0].patch->flags;
01000     savePairlists = false;
01001     usePairlists = false;
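          // Pairlist handling: when saved, the list cutoff is extended by twice the
          // maximum patch tolerance; saved lists stay usable until some atom has moved
          // more than half that tolerance, at which point a pairlist warning is counted.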
01002     if ( flags.savePairlists ) {
01003       savePairlists = true;
01004       usePairlists = true;
01005     } else if ( flags.usePairlists ) {
01006       if ( ! pairlistsValid ||
01007            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01008         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01009       } else {
01010         usePairlists = true;
01011       }
01012     }
01013     if ( ! usePairlists ) {
01014       pairlistsValid = false;
01015     }
01016     float plcutoff = cutoff;
01017     if ( savePairlists ) {
01018       pairlistsValid = true;
01019       pairlistTolerance = 2. * maxPatchTolerance;
01020       plcutoff += pairlistTolerance;
01021     }
01022     plcutoff2 = plcutoff * plcutoff;
01023 
01024     // if (atomsChanged)
01025     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01026     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01027 
01028   } // if (!simParams->GBISOn || gbisPhase == 1)
01029 
01030   // Calculate PME & VdW forces
01031   if (!simParams->GBISOn || gbisPhase == 1) {
01032     doForce();
01033     if (doStreaming) {
01034       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01035       patchReadyQueueLen = tileListKernel.getNumPatches();
01036       patchReadyQueueNext = 0;
01037       // Fill in empty patches [0 ... numEmptyPatches-1] at the top
01038       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01039       int* emptyPatches = tileListKernel.getEmptyPatches();
01040       for (int i=0;i < numEmptyPatches;i++) {
01041         patchReadyQueue[i] = emptyPatches[i];
01042       }
01043       if (patchReadyQueueLen != patches.size())
01044         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01045     }
01046   }
01047 
01048   // For GBIS phase 1 at pairlist update, we must re-sort the tile lists
01049   // before calling doGBISphase1().
01050   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01051     // In this code path doGBISphase1() is called in forceDone()
01052     forceDoneSetCallback();
01053     return;
01054   }
01055 
01056   // GBIS Phases
01057   if (simParams->GBISOn) {
01058     if (gbisPhase == 1) {
01059       doGBISphase1();
01060     } else if (gbisPhase == 2) {
01061       doGBISphase2(); 
01062     } else if (gbisPhase == 3) {
01063       doGBISphase3(); 
01064     }
01065   }
01066 
01067   // Copy forces to host
01068   if (!simParams->GBISOn || gbisPhase == 3) {
01069     if (!doStreaming) {
01070       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01071       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01072     }
01073   }
01074 
01075   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01076     // For GBIS, energies are ready after phase 2
01077     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01078       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01079       d_forces, d_forcesSlow, d_virialEnergy, stream);
01080     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01081   }
01082 
01083   // Set up the callback
01084   forceDoneSetCallback();
01085 }
01086 
01087 //
01088 // GBIS Phase 1
01089 //
01090 void CudaComputeNonbonded::doGBISphase1() {
01091   cudaCheck(cudaSetDevice(deviceID));
01092   
01093   if (atomsChanged) {
01094     GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
01095   }
01096 
01097   SimParameters *simParams = Node::Object()->simParameters;
01098   Lattice lattice = patches[0].patch->flags.lattice;
01099 
01100   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01101   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01102   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01103 
01104   GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
01105     lata, latb, latc,
01106     simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
01107 }
01108 
01109 //
01110 // GBIS Phase 2
01111 //
01112 void CudaComputeNonbonded::doGBISphase2() {
01113   cudaCheck(cudaSetDevice(deviceID));
01114 
01115   SimParameters *simParams = Node::Object()->simParameters;
01116   Lattice lattice = patches[0].patch->flags.lattice;
01117 
01118   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01119   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01120   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01121 
01122   GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
01123 
01124   GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
01125     doEnergy, doSlow,
01126     lata, latb, latc,
01127     simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
01128     (simParams->switchingActive ? simParams->switchingDist : -1.0),
01129     simParams->dielectric, simParams->solvent_dielectric,
01130     d_forces, dEdaSumH, stream);
01131 }
01132 
01133 //
01134 // GBIS Phase 3
01135 //
01136 void CudaComputeNonbonded::doGBISphase3() {
01137   cudaCheck(cudaSetDevice(deviceID));
01138   SimParameters *simParams = Node::Object()->simParameters;
01139   Lattice lattice = patches[0].patch->flags.lattice;
01140 
01141   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01142   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01143   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01144 
01145   if (doSlow) {
01146     GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
01147 
01148     GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
01149       lata, latb, latc,
01150       simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
01151   }
01152 }
01153 
01154 //
01155 // Calculate electrostatic & VdW forces
01156 //
01157 void CudaComputeNonbonded::doForce() {
01158   cudaCheck(cudaSetDevice(deviceID));
01159 
01160   Lattice lattice = patches[0].patch->flags.lattice;
01161   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01162   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01163   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01164 
01165   if (atomsChanged) {
01166     int numTileLists = calcNumTileLists();
01167     // Build initial tile lists and sort
01168     tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
01169       maxTileListLen, lata, latb, latc,
01170       cudaPatches, (const float4*)atoms, plcutoff2, stream);
01171     // Prepare tile list for atom-based refinement
01172     tileListKernel.prepareTileList(stream);
01173   }
01174 
01175   if (atomsChanged) {
01176     // Update Vdw types and exclusion index & maxdiff
01177     updateVdwTypesExcl();
01178   }
01179 
01180   beforeForceCompute = CkWallTimer();
01181 
01182   // Calculate forces (and refine tile list if atomsChanged=true)
01183   nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, atomsChanged,
01184     doEnergy, doVirial, doSlow, lata, latb, latc,
01185     (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow,
01186     stream);
01187 
01188   if (atomsChanged) {
01189     tileListKernel.finishTileList(stream);
01190   }
01191 
01192   traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
01193 }
01194 
01195 //
01196 // Compute an upper-bound estimate of the number of tile lists
01197 //
01198 int CudaComputeNonbonded::calcNumTileLists() {
01199   int numTileLists = 0;
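        // Each compute contributes one tile list per WARPSIZE-wide tile of its first patch.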
01200   for (int i=0;i < computes.size();i++) {
01201     int pi1 = computes[i].patchInd[0];
01202     int numAtoms1 = patches[pi1].numAtoms;
01203     int numTiles1 = (numAtoms1-1)/WARPSIZE+1;
01204     numTileLists += numTiles1;
01205   }
01206   return numTileLists;
01207 }
01208 
01209 //
01210 // Finish & submit reductions
01211 //
01212 void CudaComputeNonbonded::finishReductions() {
01213 
01214   if (CkMyPe() != masterPe)
01215     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01216   
01217   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01218 
01219   if (!doSkip) {
01220 
01221     if (doStreaming && (doVirial || doEnergy)) {
01222       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01223       if (!forceDoneEventRecord)
01224         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01225       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01226       forceDoneEventRecord = false;
01227     }
01228 
01229     if (doVirial) {
01230       Tensor virialTensor;
01231       virialTensor.xx = h_virialEnergy->virial[0];
01232       virialTensor.xy = h_virialEnergy->virial[1];
01233       virialTensor.xz = h_virialEnergy->virial[2];
01234       virialTensor.yx = h_virialEnergy->virial[3];
01235       virialTensor.yy = h_virialEnergy->virial[4];
01236       virialTensor.yz = h_virialEnergy->virial[5];
01237       virialTensor.zx = h_virialEnergy->virial[6];
01238       virialTensor.zy = h_virialEnergy->virial[7];
01239       virialTensor.zz = h_virialEnergy->virial[8];
01240       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01241       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01242       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01243       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01244       if (doSlow) {
01245         Tensor virialTensor;
01246         virialTensor.xx = h_virialEnergy->virialSlow[0];
01247         virialTensor.xy = h_virialEnergy->virialSlow[1];
01248         virialTensor.xz = h_virialEnergy->virialSlow[2];
01249         virialTensor.yx = h_virialEnergy->virialSlow[3];
01250         virialTensor.yy = h_virialEnergy->virialSlow[4];
01251         virialTensor.yz = h_virialEnergy->virialSlow[5];
01252         virialTensor.zx = h_virialEnergy->virialSlow[6];
01253         virialTensor.zy = h_virialEnergy->virialSlow[7];
01254         virialTensor.zz = h_virialEnergy->virialSlow[8];
01255         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01256         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01257         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01258         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01259       }
01260     }
01261     if (doEnergy) {
01262       // if (doSlow)
01263       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01264       SimParameters *simParams = Node::Object()->simParameters;
01265       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01266       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01267       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01268       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01269       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01270     }
01271 
01272     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01273   }
01274   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01275   reduction->submit();
01276 
01277   // Reset flags
01278   doSkip = false;
01279   computesChanged = false;
01280 }
01281 
01282 //
01283 // Finish a single patch
01284 //
01285 void CudaComputeNonbonded::finishPatch(int i) {
01286   if (CkMyPe() != patches[i].pe)
01287     NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
01288 
01289   PatchRecord &pr = patches[i];
01290   pr.results = pr.forceBox->open();
01291 
01292   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
01293   int atomStart = pr.atomStart;
01294   int numAtoms = pr.numAtoms;
01295   if (numAtoms > 0) {
01296     Force *f      = pr.results->f[Results::nbond];
01297     Force *f_slow = pr.results->f[Results::slow];
01298     float4 *af      = h_forces + atomStart;
01299     float4 *af_slow = h_forcesSlow + atomStart;
01300     // float maxf = 0.0f;
01301     // int maxf_k;
01302     for ( int k=0; k<numAtoms; ++k ) {
01303       int j = aExt[k].sortOrder;
01304       f[j].x += af[k].x;
01305       f[j].y += af[k].y;
01306       f[j].z += af[k].z;
01307       // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
01308       //   maxf = std::max(maxf, fabsf(af[k].x));
01309       //   maxf = std::max(maxf, fabsf(af[k].y));
01310       //   maxf = std::max(maxf, fabsf(af[k].z));
01311       //   maxf_k = k;
01312       // }
01313       if ( doSlow ) {
01314         f_slow[j].x += af_slow[k].x;
01315         f_slow[j].y += af_slow[k].y;
01316         f_slow[j].z += af_slow[k].z;
01317       }
01318     }
01319     // if (maxf > 10000.0f) {
01320     //   fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
01321     //   cudaCheck(cudaStreamSynchronize(stream));
01322     //   NAMD_die("maxf!");
01323     // }
01324   }
01325 
01326   pr.positionBox->close(&(pr.compAtom));
01327   pr.forceBox->close(&(pr.results));
01328 }
01329 
01330 //
01331 // Finish a set of patches on this pe
01332 //
01333 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
01334   if (patchSet.size() == 0)
01335     NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
01336   SimParameters *simParams = Node::Object()->simParameters;
01337   // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
01338   int gbisPhaseSave = gbisPhase;
01339   // Close Boxes depending on Phase
01340   if (simParams->GBISOn) {
01341     for (int i=0;i < patchSet.size();i++) {
01342       finishGBISPhase(patchSet[i]);
01343     }
01344   }
01345   // Finish patches
01346   if (!simParams->GBISOn || gbisPhaseSave == 3) {
01347     for (int i=0;i < patchSet.size();i++) {
01348       finishPatch(patchSet[i]);
01349     }
01350   }
01351   bool done = false;
01352   CmiLock(lock);
01353   patchesCounter -= patchSet.size();
01354   if (patchesCounter == 0) {
01355     patchesCounter = getNumPatches();
01356     done = true;
01357   }
01358   CmiUnlock(lock);
01359   if (done) {
01360     // Do reductions
01361     if (!simParams->GBISOn || gbisPhaseSave == 3) {
01362       // Reduction must be done on masterPe
01363       computeMgr->sendFinishReductions(masterPe, this);
01364     }
01365   }
01366 }
01367 
01368 //
01369 // Finish all patches that are on this pe
01370 //
01371 void CudaComputeNonbonded::finishPatchesOnPe() {
01372   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01373 }
01374 
01375 //
01376 // Finish single patch on this pe
01377 //
01378 void CudaComputeNonbonded::finishPatchOnPe(int i) {
01379   std::vector<int> v(1, i);
01380   finishSetOfPatchesOnPe(v);
01381 }
01382 
01383 void CudaComputeNonbonded::finishPatches() {
01384   computeMgr->sendFinishPatchesOnPe(pes, this);
01385 }
01386 
01387 void CudaComputeNonbonded::finishGBISPhase(int i) {
01388   if (CkMyPe() != patches[i].pe)
01389     NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
01390   PatchRecord &pr = patches[i];
01391   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
01392   int atomStart = pr.atomStart;
01393   if (gbisPhase == 1) {
01394     GBReal *psiSumMaster = psiSumH + atomStart;
01395     for ( int k=0; k<pr.numAtoms; ++k ) {
01396       int j = aExt[k].sortOrder;
01397       pr.psiSum[j] += psiSumMaster[k];
01398     }
01399     pr.psiSumBox->close(&(pr.psiSum));
01400   } else if (gbisPhase == 2) {
01401     GBReal *dEdaSumMaster = dEdaSumH + atomStart;
01402     for ( int k=0; k<pr.numAtoms; ++k ) {
01403       int j = aExt[k].sortOrder;
01404       pr.dEdaSum[j] += dEdaSumMaster[k];
01405     }
01406     pr.dEdaSumBox->close(&(pr.dEdaSum));
01407   } else if (gbisPhase == 3) {
01408     pr.intRadBox->close(&(pr.intRad)); //box 6
01409     pr.bornRadBox->close(&(pr.bornRad)); //box 7
01410     pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01411   } //end phases
01412 }
01413 
01414 void CudaComputeNonbonded::finishTimers() {
01415   SimParameters *simParams = Node::Object()->simParameters;
01416 
01417   if (simParams->GBISOn) {
01418     if (gbisPhase == 1)
01419       traceUserBracketEvent(CUDA_GBIS1_KERNEL, beforeForceCompute, CkWallTimer());
01420     if (gbisPhase == 2)
01421       traceUserBracketEvent(CUDA_GBIS2_KERNEL, beforeForceCompute, CkWallTimer());
01422     if (gbisPhase == 3)
01423       traceUserBracketEvent(CUDA_GBIS3_KERNEL, beforeForceCompute, CkWallTimer());
01424   } else {
01425     traceUserBracketEvent(CUDA_VDW_KERNEL, beforeForceCompute, CkWallTimer());
01426   }  
01427 }
01428 
01429 //
01430 // Re-sort tile lists if necessary
01431 //
01432 void CudaComputeNonbonded::reSortTileLists() {
01433   // Re-sort tile lists
01434   SimParameters *simParams = Node::Object()->simParameters;
01435   cudaCheck(cudaSetDevice(deviceID));
01436   tileListKernel.reSortTileLists(simParams->GBISOn, stream);
01437 }
01438 
01439 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
01440   CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
01441 
01442   if (CkMyPe() != c->masterPe)
01443     NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
01444 
01445   SimParameters *simParams = Node::Object()->simParameters;
01446   cudaCheck(cudaSetDevice(c->deviceID));
01447 
01448   if (c->doStreaming) {
01449     int patchInd;
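    // In streaming mode the kernels write indices of finished patches into the
    // host-visible patchReadyQueue, with -1 marking empty slots. Each ready patch is
    // consumed here, its slot reset to -1, and its finishPatch work is sent to the
    // PE that owns the patch.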
01450     while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
01451       c->patchReadyQueue[c->patchReadyQueueNext] = -1;
01452       c->patchReadyQueueNext++;
01453       c->checkCount = 0;
01454 
01455       if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
01456         c->finishTimers();
01457         if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
01458           c->reSortTileLists();
01459           c->reSortDone = true;
01460           if (simParams->GBISOn && c->gbisPhase == 1) {
01461             // We must do GBIS Phase 1
01462             c->doGBISphase1();
01463             c->forceDoneSetCallback();
01464             return;
01465           }
01466         }
01467       }
01468 
01469       // Finish patch
01470       int pe = c->patches[patchInd].pe;
01471       c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd);
01472 
01473       // Last patch, return
01474       if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
01475 
01476     }
01477   } else {
01478     if (!c->forceDoneEventRecord)
01479       NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
01480     cudaError_t err = cudaEventQuery(c->forceDoneEvent);
01481     if (err == cudaSuccess) {
01482       // Event has occurred
01483       c->forceDoneEventRecord = false;
01484       c->checkCount = 0;
01485       c->finishTimers();
01486       if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
01487         c->reSortTileLists();
01488         c->reSortDone = true;
01489         if (simParams->GBISOn && c->gbisPhase == 1) {
01490           // We must do GBIS Phase 1
01491           c->doGBISphase1();
01492           c->forceDoneSetCallback();
01493           return;
01494         }
01495       }
01496       c->finishPatches();
01497       return;
01498     } else if (err != cudaErrorNotReady) {
01499       // Anything else is an error
01500       cudaCheck(err);
01501       // NAMD_bug("CudaComputeNonbonded::forceDoneCheck, cudaEventQuery returned error");
01502     }
01503   }
01504 
01505   // if (c->checkCount % 1000 == 0)
01506   //   fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
01507 
01508   // Work has not completed yet
01509   c->checkCount++;
01510   if (c->checkCount >= 1000000) {
01511     NAMD_bug("CudaComputeNonbonded::forceDoneCheck, check count exceeded");
01512   }
01513 
01514   // Check again in 0.1 ms
01515   CcdCallBacksReset(0, walltime);
01516   CcdCallFnAfter(forceDoneCheck, arg, 0.1);
01517 }
01518 
01519 //
01520 // Set call back for all the work in the stream at this point
01521 //
01522 void CudaComputeNonbonded::forceDoneSetCallback() {
01523   if (CkMyPe() != masterPe)
01524     NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
01525   beforeForceCompute = CkWallTimer();
01526   cudaCheck(cudaSetDevice(deviceID));
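  // Record the completion event when not streaming, or when virial/energy reductions
  // require the entire stream to finish even in streaming mode; otherwise completion
  // is tracked per patch through patchReadyQueue in forceDoneCheck.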
01527   if (!doStreaming || doVirial || doEnergy) {
01528     cudaCheck(cudaEventRecord(forceDoneEvent, stream));
01529     forceDoneEventRecord = true;
01530   }
01531   checkCount = 0;
01532   CcdCallBacksReset(0, CmiWallTimer());
01533   // Arm the callback to fire 0.1 ms from now
01534   CcdCallFnAfter(forceDoneCheck, this, 0.1);
01535 }
01536 
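//
// Orders compute records by the squared length of their periodic-cell offset so that
// interactions between nearby patch images sort first.
//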
01537 struct cr_sortop_distance {
01538   const Lattice &l;
01539   cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
01540   bool operator() (CudaComputeNonbonded::ComputeRecord i,
01541       CudaComputeNonbonded::ComputeRecord j) {
01542     Vector a = l.a();
01543     Vector b = l.b();
01544     Vector c = l.c();
01545     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01546     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01547     return ( ri < rj );
01548   }
01549 };
01550 
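// Compares two integers by their bit-reversed values, examining bits from the least
// significant upward; used below to break ties between ranks in an interleaved order.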
01551 static inline bool sortop_bitreverse(int a, int b) {
01552   if ( a == b ) return 0; 
01553   for ( int bit = 1; bit; bit *= 2 ) {
01554     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
01555   }
01556   return 0;
01557 }
01558 
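//
// Orders compute records for streaming according to the (reverse) priority of their
// upstream patches: same-node and same-physical-node placement and patch priority all
// enter the comparison, with the distance ordering above as the final tie-breaker.
//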
01559 struct cr_sortop_reverse_priority {
01560   cr_sortop_distance &distop;
01561   const CudaComputeNonbonded::PatchRecord *pr;
01562   cr_sortop_reverse_priority(cr_sortop_distance &sod,
01563        const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
01564   bool pid_compare_priority(int2 pidi, int2 pidj) {
01565     const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
01566     const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
01567     if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
01568     if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
01569     if ( pri.isSameNode && ! prj.isSameNode ) return 0;
01570     if ( prj.isSameNode && ! pri.isSameNode ) return 1;
01571     if ( pri.isSameNode ) {  // and prj.isSameNode
01572       int rpri = pri.reversePriorityRankInPe;
01573       int rprj = prj.reversePriorityRankInPe;
01574       if ( rpri != rprj ) return rpri > rprj;
01575       return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
01576     }
01577     int ppi = PATCH_PRIORITY(pidi.x);
01578     int ppj = PATCH_PRIORITY(pidj.x);
01579     if ( ppi != ppj ) return ppi < ppj;
01580     return pidi.x < pidj.x;
01581   }
01582   bool operator() (CudaComputeNonbonded::ComputeRecord j,
01583       CudaComputeNonbonded::ComputeRecord i) {  // i and j reversed
01584     // Choose patch i (= patch with greater priority)
01585     int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
01586     // Choose patch j
01587     int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
01588     if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
01589     return distop(i,j);
01590   }
01591 };
01592 
01593 //
01594 // Setup computes. This is done only at startup and at load balancing, so the
01595 // CPU->GPU memory copy below does not need to be fast.
01596 //
01597 void CudaComputeNonbonded::updateComputes() {
01598   cudaCheck(cudaSetDevice(deviceID));
01599 
01600   Lattice lattice = patches[0].patch->flags.lattice;
01601   cr_sortop_distance so(lattice);
01602   std::stable_sort(computes.begin(), computes.end(), so);
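  // For streaming, a second pass re-orders the computes by reverse patch priority; the
  // stable sorts preserve the distance ordering among computes that compare equal.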
01603 
01604   if (doStreaming) {
01605     cr_sortop_reverse_priority sorp(so, patches.data());
01606     std::stable_sort(computes.begin(), computes.end(), sorp);
01607   }
01608 
01609   CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
01610 
01611   for (int i=0;i < computes.size();i++) {
01612     cudaComputes[i].patchInd.x = computes[i].patchInd[0];
01613     cudaComputes[i].patchInd.y = computes[i].patchInd[1];
01614     cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
01615     cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
01616     cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
01617   }
01618 
01619   tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
01620   cudaCheck(cudaStreamSynchronize(stream));
01621 
01622   delete [] cudaComputes;
01623 }
01624 
01625 struct exlist_sortop {
01626   bool operator() (int32 *li, int32 *lj) {
01627     return ( li[1] < lj[1] );
01628   }
01629 };
01630 
01631 //
01632 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
01633 //
01634 void CudaComputeNonbonded::buildExclusions() {
01635   cudaCheck(cudaSetDevice(deviceID));
01636 
01637   Molecule *mol = Node::Object()->molecule;
01638 
01639 #ifdef MEM_OPT_VERSION
01640   int natoms = mol->exclSigPoolSize;
01641 #else
01642   int natoms = mol->numAtoms; 
01643 #endif
01644 
01645   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
01646   exclusionsByAtom = new int2[natoms];
01647 
01648   // create unique sorted lists
01649 
01650   ObjectArena<int32> listArena;
01651   ResizeArray<int32*> unique_lists;
01652   int32 **listsByAtom = new int32*[natoms];
01653   SortableResizeArray<int32> curList;
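  // Each unique exclusion list is laid out as: [0] = entry count n, [1] = maxdiff (the
  // largest |offset| in the list), [2] = bit index of the list's zero-offset position in
  // the global bitmap (filled in later), [3..n+2] = sorted offsets relative to the atom
  // itself (0 = self).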
01654   for ( int i=0; i<natoms; ++i ) {
01655     curList.resize(0);
01656     curList.add(0);  // always excluded from self
01657 #ifdef MEM_OPT_VERSION
01658     const ExclusionSignature *sig = mol->exclSigPool + i;
01659     int n = sig->fullExclCnt;
01660     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
01661     n += 1;
01662 #else
01663     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
01664     int n = mol_list[0] + 1;
01665     for ( int j=1; j<n; ++j ) {
01666       curList.add(mol_list[j] - i);
01667     }
01668 #endif
01669     curList.sort();
01670 
01671     int j;
01672     for ( j=0; j<unique_lists.size(); ++j ) {
01673       if ( n != unique_lists[j][0] ) continue;  // no match
01674       int k;
01675       for ( k=0; k<n; ++k ) {
01676         if ( unique_lists[j][k+3] != curList[k] ) break;
01677       }
01678       if ( k == n ) break;  // found match
01679     }
01680     if ( j == unique_lists.size() ) {  // no match
01681       int32 *list = listArena.getNewArray(n+3);
01682       list[0] = n;
01683       int maxdiff = 0;
01684       maxdiff = -1 * curList[0];
01685       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
01686       list[1] = maxdiff;
01687       for ( int k=0; k<n; ++k ) {
01688         list[k+3] = curList[k];
01689       }
01690       unique_lists.add(list);
01691     }
01692     listsByAtom[i] = unique_lists[j];
01693   }
01694   // sort lists by maxdiff
01695   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
01696   long int totalbits = 0;
01697   int nlists = unique_lists.size();
01698   for ( int j=0; j<nlists; ++j ) {
01699     int32 *list = unique_lists[j];
01700     int maxdiff = list[1];
01701     list[2] = totalbits + maxdiff;
01702     totalbits += 2*maxdiff + 1;
01703   }
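  // Each list now owns a window of 2*maxdiff + 1 bits in the global bitmap, with list[2]
  // holding the bit index of the window's center (offset 0).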
01704   for ( int i=0; i<natoms; ++i ) {
01705     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
01706     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
01707   }
01708   delete [] listsByAtom;
01709 
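  // Round the bitmap size up to a whole number of 32-bit words.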
01710   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
01711 
01712   {
01713     long int bytesneeded = totalbits / 8;
01714     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
01715       CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
01716                unique_lists.size(), bytesneeded);
01717     }
01718 
01719     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
01720     if ( bytesneeded > bytesavail ) {
01721       char errmsg[512];
01722       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
01723                      "but only %ld bytes can be addressed with 32-bit int.",
01724                      unique_lists.size(), bytesneeded, bytesavail);
01725       NAMD_die(errmsg);
01726     }
01727   }
01728 
01729 #define SET_EXCL(EXCL,BASE,DIFF) \
01730          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
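// SET_EXCL sets bit (BASE+DIFF) of the packed bitmap: bit_index >> 5 selects the
// 32-bit word and bit_index & 31 selects the bit within it.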
01731 
01732   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
01733   memset(exclusion_bits, 0, totalbits/8);
01734 
01735   long int base = 0;
01736   for ( int i=0; i<unique_lists.size(); ++i ) {
01737     base += unique_lists[i][1];
01738     if ( unique_lists[i][2] != (int32)base ) {
01739       NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
01740     }
01741     int n = unique_lists[i][0];
01742     for ( int j=0; j<n; ++j ) {
01743       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
01744     }
01745     base += unique_lists[i][1] + 1;
01746   }
01747 
01748   int numExclusions = totalbits/32;
01749 
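  // A device-side lookup presumably mirrors SET_EXCL: with start = exclusionsByAtom[i].y
  // and maxdiff = exclusionsByAtom[i].x, atom pair (i, i+diff) with |diff| <= maxdiff is
  // excluded iff (exclusion_bits[(start + diff) >> 5] >> ((start + diff) & 31)) & 1.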
01750   nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
01751 
01752   delete [] exclusion_bits;
01753 }
01754 
01755 #endif // NAMD_CUDA
