CudaComputeNonbonded.C

00001 #include <algorithm>
00002 #include <map>
00003 #include <vector>
00004 #include "NamdTypes.h"
00005 #include "charm++.h"
00006 #include "Patch.h"
00007 #include "PatchMap.h"
00008 #include "ProxyMgr.h"
00009 #include "LJTable.h"
00010 #include "Node.h"
00011 #include "ObjectArena.h"
00012 // #include "ComputeCUDAMgr.h"
00013 #include "ReductionMgr.h"
00014 #include "CudaComputeNonbonded.h"
00015 #include "WorkDistrib.h"
00016 #include "HomePatch.h"
00017 #include "Priorities.h"
00018 #include "ComputePmeCUDAMgr.h"
00019 //#include "CudaUtils.h"
00020 
00021 #include "DeviceCUDA.h"
00022 #ifdef NAMD_CUDA
00023 #ifdef WIN32
00024 #define __thread __declspec(thread)
00025 #endif
00026 extern __thread DeviceCUDA *deviceCUDA;
00027 #endif
00028 
00029 #ifdef NAMD_CUDA
00030 
00031 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++
00032 
00033 //
00034 // Class constructor
00035 //
00036 CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID,
00037   CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) : 
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
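        // Host and device buffers are allocated lazily (see initialize() and
        // reallocateArrays()); the constructor only zeroes the pointers and sizes.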
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084 
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 }
00102 
00103 //
00104 // Class destructor
00105 //
00106 CudaComputeNonbonded::~CudaComputeNonbonded() {
00107   cudaCheck(cudaSetDevice(deviceID));
00108   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00109   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00110   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00111   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00112   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00113   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00114   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00115   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00116 
00117   // GBIS
00118   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00119   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00120   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00121   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00122   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00123   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00124 
00125   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00126 
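        // stream, forceDoneEvent, lock, and reduction exist only when this compute
        // owns patches (they are created in initialize()), hence the guard below.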
00127   if (patches.size() > 0) {
00128     deallocate_host<VirialEnergy>(&h_virialEnergy);
00129     deallocate_device<VirialEnergy>(&d_virialEnergy);
00130     cudaCheck(cudaStreamDestroy(stream));
00131     cudaCheck(cudaEventDestroy(forceDoneEvent));
00132     CmiDestroyLock(lock);
00133     delete reduction;
00134   }
00135 
00136   // NOTE: unregistering happens in the [sync] entry method
00137   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00138 
00139 }
00140 
00141 void CudaComputeNonbonded::unregisterBox(int i) {
00142   if (patches[i].positionBox != NULL) patches[i].patch->unregisterPositionPickup(this, &patches[i].positionBox);
00143   if (patches[i].forceBox != NULL) patches[i].patch->unregisterForceDeposit(this, &patches[i].forceBox);
00144   if (patches[i].intRadBox != NULL) patches[i].patch->unregisterIntRadPickup(this, &patches[i].intRadBox);
00145   if (patches[i].psiSumBox != NULL) patches[i].patch->unregisterPsiSumDeposit(this, &patches[i].psiSumBox);
00146   if (patches[i].bornRadBox != NULL) patches[i].patch->unregisterBornRadPickup(this, &patches[i].bornRadBox);
00147   if (patches[i].dEdaSumBox != NULL) patches[i].patch->unregisterDEdaSumDeposit(this, &patches[i].dEdaSumBox);
00148   if (patches[i].dHdrPrefixBox != NULL) patches[i].patch->unregisterDHdrPrefixPickup(this, &patches[i].dHdrPrefixBox);
00149 }
00150 
00151 void CudaComputeNonbonded::unregisterBoxesOnPe() {
00152   if (rankPatches[CkMyRank()].size() == 0)
00153     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00154   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00155     unregisterBox(rankPatches[CkMyRank()][i]);
00156   }
00157 }
00158 
00159 //
00160 // Register a self compute (single patch).
00161 // Calls must be serialized.
00162 //
00163 void CudaComputeNonbonded::registerComputeSelf(ComputeID cid, PatchID pid) {
00164   computesChanged = true;
00165   addPatch(pid);
00166   addCompute(cid, pid, pid, 0.);
00167 }
00168 
00169 //
00170 // Register a pair compute (two patches).
00171 // Calls must be serialized.
00172 //
00173 void CudaComputeNonbonded::registerComputePair(ComputeID cid, PatchID* pid, int* trans) {
00174   computesChanged = true;
00175   addPatch(pid[0]);
00176   addPatch(pid[1]);
00177   PatchMap* patchMap = PatchMap::Object();
00178   int t1 = trans[0];
00179   int t2 = trans[1];
00180   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
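        // trans[] encodes each patch's periodic image as a base-3 index: components
        // t%3-1, (t/3)%3-1, and t/9-1 give the image shift along x, y, z (each in {-1,0,1}).
        // For example, t1=14 and t2=13 add an extra offset of (+1, 0, 0).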
00181   offset.x += (t1%3-1) - (t2%3-1);
00182   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00183   offset.z += (t1/9-1) - (t2/9-1);
00184   addCompute(cid, pid[0], pid[1], offset);
00185 }
00186 
00187 //
00188 // Add patch
00189 //
00190 void CudaComputeNonbonded::addPatch(PatchID pid) {
00191   patches.push_back(PatchRecord(pid));
00192 }
00193 
00194 //
00195 // Add compute
00196 //
00197 void CudaComputeNonbonded::addCompute(ComputeID cid, PatchID pid1, PatchID pid2, Vector offset) {
00198   ComputeRecord cr;
00199   cr.cid = cid;
00200   cr.pid[0] = pid1;
00201   cr.pid[1] = pid2;
00202   cr.offset = offset;
00203   computes.push_back(cr);
00204 }
00205 
00206 //
00207 // Update numAtoms and numFreeAtoms on a patch
00208 //
00209 void CudaComputeNonbonded::updatePatch(int i) {
00210   int numAtoms = patches[i].patch->getNumAtoms();
00211   int numFreeAtoms = numAtoms;
00212   if ( fixedAtomsOn ) {
00213     const CompAtomExt *aExt = patches[i].patch->getCompAtomExtInfo();
00214     for ( int j=0; j< numAtoms; ++j ) {
00215       if ( aExt[j].atomFixed ) --numFreeAtoms;
00216     }
00217   }
00218   patches[i].numAtoms = numAtoms;
00219   patches[i].numFreeAtoms = numFreeAtoms;
00220   cudaPatches[i].numAtoms = numAtoms;
00221   cudaPatches[i].numFreeAtoms = numFreeAtoms;
00222 }
00223 
00224 int CudaComputeNonbonded::findPid(PatchID pid) {
00225   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00226     int j = rankPatches[CkMyRank()][i];
00227     if (patches[j].patchID == pid) return j;
00228   }
00229   return -1;
00230 }
00231 
00232 void CudaComputeNonbonded::patchReady(PatchID pid, int doneMigration, int seq) {
00233   if (doneMigration) {
00234     int i = findPid(pid);
00235     if (i == -1)
00236       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00237     updatePatch(i);
00238   }
00239   CmiLock(lock);
00240   Compute::patchReady(pid, doneMigration, seq);
00241   CmiUnlock(lock);
00242 }
00243 
00244 void CudaComputeNonbonded::gbisP2PatchReady(PatchID pid, int seq) {
00245   CmiLock(lock);
00246   Compute::gbisP2PatchReady(pid, seq);
00247   CmiUnlock(lock);
00248 }
00249 
00250 void CudaComputeNonbonded::gbisP3PatchReady(PatchID pid, int seq) {
00251   CmiLock(lock);
00252   Compute::gbisP3PatchReady(pid, seq);
00253   CmiUnlock(lock);
00254 }
00255 
00256 void CudaComputeNonbonded::assignPatch(int i) {
00257   PatchMap* patchMap = PatchMap::Object();
00258   PatchID pid = patches[i].patchID;
00259   Patch* patch = patchMap->patch(pid);
00260   if (patch == NULL) {
00261     // Create ProxyPatch if none exists
00262     ProxyMgr::Object()->createProxy(pid);
00263     patch = patchMap->patch(pid);
00264   }
00265   patches[i].patch = patch;
00266   if (patches[i].patch == NULL) {
00267     NAMD_bug("CudaComputeNonbonded::assignPatch, patch not found");
00268   }
00269   patches[i].positionBox = patches[i].patch->registerPositionPickup(this);
00270   patches[i].forceBox    = patches[i].patch->registerForceDeposit(this);
00271   SimParameters *simParams = Node::Object()->simParameters;
00272   if (simParams->GBISOn) {
00273     patches[i].intRadBox     = patches[i].patch->registerIntRadPickup(this);
00274     patches[i].psiSumBox     = patches[i].patch->registerPsiSumDeposit(this);
00275     patches[i].bornRadBox    = patches[i].patch->registerBornRadPickup(this);
00276     patches[i].dEdaSumBox    = patches[i].patch->registerDEdaSumDeposit(this);
00277     patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this);
00278   }
00279   // Store Pe where this patch was registered
00280 #if 1
00281   if (patches[i].pe != CkMyPe()) {
00282     NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe");
00283   }
00284 #else
00285   patches[i].pe = CkMyPe();
00286 #endif
00287   //
00288   patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00289   patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00290 }
00291 
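      //
      // Comparator used in assignPatchesOnPe() to order home patches so that entries
      // with a larger PATCH_PRIORITY value come first (ties broken by descending patch ID).
      //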
00292 struct pid_sortop_reverse_priority {
00293   bool operator() (int2 pidj, int2 pidi) {  // i and j reversed
00294     int ppi = PATCH_PRIORITY(pidi.x);
00295     int ppj = PATCH_PRIORITY(pidj.x);
00296     if ( ppi != ppj ) return ppi < ppj;
00297     return pidi.x < pidj.x;
00298   }
00299 };
00300 
00301 void CudaComputeNonbonded::assignPatchesOnPe() {
00302   if (rankPatches[CkMyRank()].size() == 0)
00303     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00304 
00305   // calculate priority rank of local home patch within pe
00306   {
00307     PatchMap* patchMap = PatchMap::Object();
00308     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00309     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00310       int i = rankPatches[CkMyRank()][k];
00311       int pid = patches[i].patchID;
00312       int homePe = patchMap->node(pid);
00313       if ( CkNodeOf(homePe) == CkMyNode() ) {
00314         int2 pid_index;
00315         pid_index.x = pid;
00316         pid_index.y = i;
00317         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00318       }
00319     }
00320     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00321       pid_sortop_reverse_priority so;
00322       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00323       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00324       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00325         int index = homePatchByRank[i][j].y;
00326         patches[index].reversePriorityRankInPe = j + masterBoost;
00327       }
00328     }
00329   }
00330 
00331   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00332     assignPatch(rankPatches[CkMyRank()][i]);
00333   }
00334 }
00335 
00336 //
00337 // Returns the Pe on this node that has home patch "pid", or -1 if not found
00338 //
00339 // int findHomePatchPe(std::vector<PatchIDList>& rankPatchIDs, PatchID pid) {
00340 int findHomePatchPe(PatchIDList* rankPatchIDs, PatchID pid) {
00341   // for (int i=0;i < rankPatchIDs.size();i++) {
00342   for (int i=0;i < CkMyNodeSize();i++) {
00343     if (rankPatchIDs[i].find(pid) != -1) return CkNodeFirst(CkMyNode()) + i;
00344   }
00345   return -1;
00346 }
00347 
00348 //
00349 // Find all PEs on this node that have patch "pid"
00350 //
00351 void findProxyPatchPes(std::vector<int>& proxyPatchPes, PatchID pid) {
00352   proxyPatchPes.clear();
00353   for (int i=0;i < CkMyNodeSize();i++) {
00354     int pe = CkNodeFirst(CkMyNode()) + i;
00355     if (PatchMap::ObjectOnPe(pe)->patch(pid) != NULL)
00356       proxyPatchPes.push_back(pe);
00357   }
00358 }
00359 
00360 //
00361 // Called after all computes have been registered
00362 //
00363 void CudaComputeNonbonded::assignPatches(ComputeMgr* computeMgrIn) {
00364   // Remove duplicate patches
00365   std::sort(patches.begin(), patches.end());
00366   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00367   patches.erase(last, patches.end());
00368   // Set number of patches
00369   setNumPatches(patches.size());
00370   masterPe = CkMyPe();
00371   computeMgr = computeMgrIn;
00372   // Start patch counter
00373   patchesCounter = getNumPatches();
00374   // Patch ID map
00375   std::map<PatchID, int> pidMap;
00376 #if 1
00377   //-------------------------------------------------------
00378   // Copied in from ComputeNonbondedCUDA::assignPatches()
00379   //-------------------------------------------------------
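        // Greedy patch-to-PE assignment among the PEs on this node that share this device:
        //   1) a patch whose home PE is on this node is assigned to its home PE;
        //   2) a patch whose proxy exists on exactly one sharing PE goes to that PE;
        //   3) remaining patches are handed out round-robin over the sharing PEs,
        //      preferring PEs that already hold a proxy of the patch.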
00380 
00381   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00382   int numPesOnNodeSharingDevice = 0;
00383   int masterIndex = -1;
00384   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00385     int pe = deviceCUDA->getPesSharingDevice(i);
00386     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00387     if ( CkNodeOf(pe) == CkMyNode() ) {
00388       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00389     }
00390   }
00391 
00392   std::vector<int> count(patches.size(), 0);
00393   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00394   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00395   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00396 
00397   PatchMap* patchMap = PatchMap::Object();
00398 
00399   int unassignedpatches = patches.size();
00400 
00401   for (int i=0;i < patches.size(); ++i) {
00402     patches[i].pe = -1;
00403   }
00404 
00405   // assign if home pe and build table of natural proxies
00406   for (int i=0;i < patches.size(); ++i) {
00407     int pid = patches[i].patchID;
00408     // homePe = PE where the patch currently resides
00409     int homePe = patchMap->node(pid);
00410     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00411       int pe = pesOnNodeSharingDevice[j];
00412       // If homePe is sharing this device, assign this patch to homePe
00413       if ( pe == homePe ) {
00414         patches[i].pe = pe;
00415         --unassignedpatches;
00416         pcount[j] += 1;
00417       }
00418       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00419         table[i*numPesOnNodeSharingDevice + j] = 1;
00420       }
00421     }
00422     // Assign this patch to homePe, if it resides on the same node
00423     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00424       patches[i].pe = homePe;
00425       --unassignedpatches;
00426       rankpcount[CkRankOf(homePe)] += 1;
00427     }
00428   }
00429   // assign if only one pe has a required proxy
00430   for (int i=0; i < patches.size(); ++i) {
00431     int pid = patches[i].patchID;
00432     if ( patches[i].pe != -1 ) continue;
00433     int c = 0;
00434     int lastj;
00435     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00436       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00437         ++c;
00438         lastj = j;
00439       }
00440     }
00441     count[i] = c;
00442     if ( c == 1 ) {
00443       patches[i].pe = pesOnNodeSharingDevice[lastj];
00444       --unassignedpatches;
00445       pcount[lastj] += 1;
00446     }
00447   }
00448   int assignj = 0;
00449   while ( unassignedpatches ) {
00450     int i;
00451     for (i=0;i < patches.size(); ++i) {
00452       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00453       int pid = patches[i].patchID;
00454       // patch_record &pr = patchRecords[pid];
00455       if ( patches[i].pe != -1 ) continue;
00456       patches[i].pe = pesOnNodeSharingDevice[assignj];
00457       --unassignedpatches;
00458       pcount[assignj] += 1;
00459       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00460       break;
00461     }
00462     if (i < patches.size() ) continue;  // start search again
00463     for ( i=0;i < patches.size(); ++i ) {
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       if ( count[i] ) continue;
00468       patches[i].pe = pesOnNodeSharingDevice[assignj];
00469       --unassignedpatches;
00470       pcount[assignj] += 1;
00471       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00472       break;
00473     }
00474     if ( i < patches.size() ) continue;  // start search again
00475     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00476   }
00477 
00478   // For each rank, list of patches
00479   rankPatches.resize(CkMyNodeSize());
00480   for (int i=0; i < patches.size(); ++i) {
00481     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00482     pidMap[patches[i].patchID] = i;
00483   }
00484 
00485   // for ( int i=0; i < patches.size(); ++i ) {
00486   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00487   // }
00488 
00489 /*
00490   slavePes = new int[CkMyNodeSize()];
00491   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00492   numSlaves = 0;
00493   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00494     int pe = pesOnNodeSharingDevice[j];
00495     int rank = pe - CkNodeFirst(CkMyNode());
00496     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00497     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00498     if ( pe == CkMyPe() ) continue;
00499     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00500     rankpcount[rank] = 0;  // skip in rank loop below
00501     slavePes[numSlaves] = pe;
00502     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00503     ++numSlaves;
00504   }
00505   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00506     int pe = CkNodeFirst(CkMyNode()) + j;
00507     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rankpcount[j]);
00509     if ( ! rankpcount[j] ) continue;
00510     if ( pe == CkMyPe() ) continue;
00511     slavePes[numSlaves] = pe;
00512     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00513     ++numSlaves;
00514   }
00515 */
00516 
00517 #else
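        // Alternative assignment strategy (disabled by the "#if 1" above): place each patch
        // on its home PE when present on this node, otherwise on a PE that already has a
        // proxy of it, otherwise on the rank with the fewest proxy patches.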
00518   // For each rank, list of patches
00519   rankPatches.resize(CkMyNodeSize());
00520   // For each rank, list of home patch IDs
00521   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00522   for (int i=0;i < CkMyNodeSize();i++) {
00523     int pe = CkNodeFirst(CkMyNode()) + i;
00524     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00525   }
00526   std::vector<int> proxyPatchPes;
00527   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00528   //--------------------------------------------------------
00529   // Build a list of PEs to avoid
00530   std::vector<int> pesToAvoid;
00531 #if 0
00532   // Avoid other GPUs' master PEs
00533   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00534     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00535     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00536   }
00537   // Avoid PEs that are involved in PME
00538   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00539   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00540     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00541   }
00542   // Set counters of avoidable PEs to high numbers
00543   for (int i=0;i < pesToAvoid.size();i++) {
00544     int pe = pesToAvoid[i];
00545     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);    
00546   }
00547 #endif
00548   // Avoid master Pe somewhat
00549   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00550   //--------------------------------------------------------
00551   for (int i=0;i < patches.size();i++) {
00552     PatchID pid = patches[i].patchID;
00553     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00554     if (pe == -1) {
00555       // Patch not present on this node => try finding a ProxyPatch
00556       findProxyPatchPes(proxyPatchPes, pid);
00557       if (proxyPatchPes.size() == 0) {
00558         // No ProxyPatch => create one on rank that has the least ProxyPatches
00559         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00560         pe = CkNodeFirst(CkMyNode()) + rank;
00561         peProxyPatchCounter[rank]++;
00562       } else {
00563         // Choose a ProxyPatch, trying to avoid masterPe (current Pe) and Pes that
00564         // already have a ProxyPatch. This is done by picking the entry with the
00565         // minimum peProxyPatchCounter value, i.e., the minimum of
00566         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])] over proxyPatchPes.
00567         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00568         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00569         //   - proxyPatchPes.begin();
00570         // pe = proxyPatchPes[pppi];
00571         int minCounter = (1 << 30);
00572         for (int j=0;j < proxyPatchPes.size();j++) {
00573           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00574             pe = proxyPatchPes[j];
00575             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00576           }
00577         }
00578         if (pe == -1)
00579           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00580         peProxyPatchCounter[CkRankOf(pe)]++;
00581       }
00582     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00583       // Found home patch on this node, but it's on PE that should be avoided => find a new one
00584       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00585       pe = CkNodeFirst(CkMyNode()) + rank;
00586       peProxyPatchCounter[rank]++;
00587     }
00588     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00589       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00590     rankPatches[CkRankOf(pe)].push_back(i);
00591     pidMap[pid] = i;
00592   }
00593 
00594   delete [] rankHomePatchIDs;
00595 #endif
00596   // Setup computes using pidMap
00597   for (int i=0;i < computes.size();i++) {
00598     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00599     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00600   }
00601   for (int i=0;i < CkMyNodeSize();i++) {
00602     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00603   }
00604   computeMgr->sendAssignPatchesOnPe(pes, this);
00605 }
00606 
00607 void CudaComputeNonbonded::initialize() {
00608   if (patches.size() > 0) {
00609     // Allocate CUDA version of patches
00610     cudaCheck(cudaSetDevice(deviceID));
00611     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00612 
00613     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00614     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00615 
00616 #if CUDA_VERSION >= 5050
00617     int leastPriority, greatestPriority;
00618     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00619     int priority = (doStreaming) ? leastPriority : greatestPriority;
00620     // int priority = greatestPriority;
00621     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00622 #else
00623     cudaCheck(cudaStreamCreate(&stream));
00624 #endif
00625     cudaCheck(cudaEventCreate(&forceDoneEvent));
00626 
00627     buildExclusions();
00628 
00629     lock = CmiCreateLock();
00630 
00631     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00632   }  
00633 }
00634 
00635 //
00636 // atomUpdate() can be called by any Pe
00637 //
00638 void CudaComputeNonbonded::atomUpdate() {
00639   atomsChangedIn = true;
00640 }
00641 
00642 //
00643 // Compute patches[].atomStart, patches[].numAtoms, patches[].numFreeAtoms, and atomStorageSize
00644 //
00645 void CudaComputeNonbonded::updatePatches() {
00646 
00647   // Maximum number of tiles per tile list
00648   maxTileListLen = 0;
00649   int atomStart = 0;
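        // Each patch's atom block is padded up to a multiple of WARPSIZE so that every
        // patch starts on a warp boundary; e.g. with WARPSIZE=32, 100 atoms occupy
        // 4 tiles = 128 padded slots. atomStorageSize is the total padded length.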
00650   for (int i=0;i < patches.size();i++) {
00651     patches[i].atomStart = atomStart;
00652     cudaPatches[i].atomStart = atomStart;
00653     int numAtoms = patches[i].numAtoms;
00654     int numTiles = ((numAtoms-1)/WARPSIZE+1);
00655     maxTileListLen = std::max(maxTileListLen, numTiles);
00656     atomStart += numTiles*WARPSIZE;
00657   }
00658   atomStorageSize = atomStart;
00659 
00660   if (maxTileListLen >= 65536) {
00661     NAMD_bug("CudaComputeNonbonded::updatePatches, maximum number of tiles per tile list (65536) exceeded");
00662   }
00663 }
00664 
00665 void CudaComputeNonbonded::skipPatch(int i) {
00666   if (CkMyPe() != patches[i].pe)
00667     NAMD_bug("CudaComputeNonbonded::skipPatch called on wrong Pe");
00668   Flags &flags = patches[i].patch->flags;
00669   patches[i].positionBox->skip();
00670   patches[i].forceBox->skip();
00671   if (flags.doGBIS) {
00672     patches[i].psiSumBox->skip();
00673     patches[i].intRadBox->skip();
00674     patches[i].bornRadBox->skip();
00675     patches[i].dEdaSumBox->skip();
00676     patches[i].dHdrPrefixBox->skip();
00677   }
00678 }
00679 
00680 void CudaComputeNonbonded::skipPatchesOnPe() {
00681   if (rankPatches[CkMyRank()].size() == 0)
00682     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00683   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00684     skipPatch(rankPatches[CkMyRank()][i]);
00685   }
00686   bool done = false;
00687   CmiLock(lock);
00688   patchesCounter -= rankPatches[CkMyRank()].size();
00689   if (patchesCounter == 0) {
00690     patchesCounter = getNumPatches();
00691     done = true;
00692   }
00693   CmiUnlock(lock);
00694   if (done) {
00695     // Reduction must be done on masterPe
00696     computeMgr->sendFinishReductions(masterPe, this);
00697   }
00698 }
00699 
00700 void CudaComputeNonbonded::skip() {
00701   if (CkMyPe() != masterPe)
00702     NAMD_bug("CudaComputeNonbonded::skip() called on non masterPe");
00703 
00704   if (patches.size() == 0) return;
00705 
00706   doSkip = true;
00707 
00708   computeMgr->sendSkipPatchesOnPe(pes, this);
00709 }
00710 
00711 void CudaComputeNonbonded::getMaxMovementTolerance(float& maxAtomMovement, float& maxPatchTolerance) {
00712   if (CkMyPe() != masterPe)
00713     NAMD_bug("CudaComputeNonbonded::getMaxMovementTolerance() called on non masterPe");
00714 
00715   for (int i=0;i < patches.size();++i) {
00716     PatchRecord &pr = patches[i];
00717 
00718     float maxMove = pr.patch->flags.maxAtomMovement;
00719     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
00720 
00721     float maxTol = pr.patch->flags.pairlistTolerance;
00722     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
00723   }
00724 }
00725 
00726 inline void CudaComputeNonbonded::updateVdwTypesExclLoop(int first, int last, void *result, int paraNum, void *param) {
00727   CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
00728   c->updateVdwTypesExclSubset(first, last);
00729 }
00730 
00731 void CudaComputeNonbonded::updateVdwTypesExclSubset(int first, int last) {
00732   for (int i=first;i <= last;i++) {
00733     PatchRecord &pr = patches[i];
00734     int start = pr.atomStart;
00735     int numAtoms = pr.numAtoms;
00736     const CompAtom *compAtom = pr.compAtom;
00737     const CompAtomExt *compAtomExt = pr.patch->getCompAtomExtInfo();
00738     // Atoms have changed, re-do exclusions and vdw types
00739     int2* exclp = exclIndexMaxDiff + start;
00740     int* aip = atomIndex + start;
00741     for ( int k=0;k < numAtoms; ++k ) {
00742       int j = compAtomExt[k].sortOrder;
00743       vdwTypes[start + k] = compAtom[j].vdwType;
00744       aip[k] = compAtomExt[j].id;
00745 #ifdef MEM_OPT_VERSION
00746       exclp[k].x = exclusionsByAtom[compAtomExt[j].exclId].y;
00747       exclp[k].y = exclusionsByAtom[compAtomExt[j].exclId].x;
00748 #else // ! MEM_OPT_VERSION
00749       exclp[k].x = exclusionsByAtom[compAtomExt[j].id].y;
00750       exclp[k].y = exclusionsByAtom[compAtomExt[j].id].x;
00751 #endif // MEM_OPT_VERSION
00752     }
00753   }
00754 }
00755 
00756 //
00757 // Called every time atoms changed
00758 //
00759 void CudaComputeNonbonded::updateVdwTypesExcl() {
00760   // Re-allocate (VdwTypes, exclIndexMaxDiff) as needed
00761   reallocate_host<int>(&vdwTypes, &vdwTypesSize, atomStorageSize, 1.4f);
00762   reallocate_host<int2>(&exclIndexMaxDiff, &exclIndexMaxDiffSize, atomStorageSize, 1.4f);
00763   reallocate_host<int>(&atomIndex, &atomIndexSize, atomStorageSize, 1.4f);
00764 
00765 #if CMK_SMP && USE_CKLOOP
00766   int useCkLoop = Node::Object()->simParameters->useCkLoop;
00767   if (useCkLoop >= 1) {
00768     CkLoop_Parallelize(updateVdwTypesExclLoop, 1, (void *)this, CkMyNodeSize(), 0, patches.size()-1);
00769   } else
00770 #endif
00771   {
00772     updateVdwTypesExclSubset(0, patches.size()-1);
00773   }
00774 
00775   nonbondedKernel.updateVdwTypesExcl(atomStorageSize, vdwTypes, exclIndexMaxDiff, atomIndex, stream);
00776 }
00777 
00778 inline void CudaComputeNonbonded::copyAtomsLoop(int first, int last, void *result, int paraNum, void *param) {
00779   CudaComputeNonbonded* c = (CudaComputeNonbonded *)param;
00780   c->copyAtomsSubset(first, last);
00781 }
00782 
00783 void CudaComputeNonbonded::copyAtomsSubset(int first, int last) {
00784   for (int i=first;i <= last;++i) {
00785     PatchRecord &pr = patches[i];
00786     int numAtoms = pr.numAtoms;
00787     if (numAtoms > 0) {
00788       int start = pr.atomStart;
00789       const CudaAtom *src = pr.patch->getCudaAtomList();
00790       CudaAtom *dst = atoms + start;
00791       memcpy(dst, src, sizeof(CudaAtom)*numAtoms);
00792       // Fill the rest with the copy of the last atom
00793       int numAtomsAlign = ((numAtoms-1)/32+1)*32;
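            // (the literal 32 presumably matches WARPSIZE, i.e. the padding used in updatePatches())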
00794       CudaAtom lastAtom = src[numAtoms-1];
00795       for (int j=numAtoms;j < numAtomsAlign;j++) {
00796         dst[j] = lastAtom;
00797       }
00798     }
00799   }
00800 }
00801 
00802 void CudaComputeNonbonded::copyGBISphase(int i) {
00803   if (CkMyPe() != patches[i].pe)
00804     NAMD_bug("CudaComputeNonbonded::copyGBISphase called on wrong Pe");
00805   PatchRecord &pr = patches[i];
00806   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
00807   if (gbisPhase == 1) {
00808     //Copy GBIS intRadius to Host
00809     if (atomsChanged) {
00810       float *intRad0 = intRad0H + pr.atomStart;
00811       float *intRadS = intRadSH + pr.atomStart;
00812       for (int k=0;k < pr.numAtoms;++k) {
00813         int j = aExt[k].sortOrder;
00814         intRad0[k] = pr.intRad[2*j+0];
00815         intRadS[k] = pr.intRad[2*j+1];
00816       }
00817     }
00818   } else if (gbisPhase == 2) {
00819     float *bornRad = bornRadH + pr.atomStart;
00820     for ( int k=0; k < pr.numAtoms; ++k ) {
00821       int j = aExt[k].sortOrder;
00822       bornRad[k] = pr.bornRad[j];
00823     }
00824   } else if (gbisPhase == 3) {
00825     float *dHdrPrefix = dHdrPrefixH + pr.atomStart;
00826     for ( int k=0; k < pr.numAtoms; ++k ) {
00827       int j = aExt[k].sortOrder;
00828       dHdrPrefix[k] = pr.dHdrPrefix[j];
00829     }
00830   } // end phases
00831 }
00832 
00833 void CudaComputeNonbonded::openBox(int i) {
00834   if (CkMyPe() != patches[i].pe)
00835     NAMD_bug("CudaComputeNonbonded::openBox called on wrong Pe");
00836   SimParameters *simParams = Node::Object()->simParameters;
00837   if (!simParams->GBISOn || gbisPhase == 1) {
00838     patches[i].compAtom = patches[i].positionBox->open();
00839     copyAtomsSubset(i, i);
00840   }
00841   if (simParams->GBISOn) {
00842     if (gbisPhase == 1) {
00843       patches[i].intRad     = patches[i].intRadBox->open();
00844       patches[i].psiSum     = patches[i].psiSumBox->open();
00845     } else if (gbisPhase == 2) {
00846       patches[i].bornRad    = patches[i].bornRadBox->open();
00847       patches[i].dEdaSum    = patches[i].dEdaSumBox->open();
00848     } else if (gbisPhase == 3) {
00849       patches[i].dHdrPrefix = patches[i].dHdrPrefixBox->open();
00850     }
00851     copyGBISphase(i);
00852   }
00853 }
00854 
00855 void CudaComputeNonbonded::messageEnqueueWork() {
00856   if (masterPe != CkMyPe())
00857     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00858   WorkDistrib::messageEnqueueWork(this);
00859 }
00860 
00861 void CudaComputeNonbonded::openBoxesOnPe() {
00862   if (rankPatches[CkMyRank()].size() == 0)
00863     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00864   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00865     openBox(rankPatches[CkMyRank()][i]);
00866   }
00867   bool done = false;
00868   CmiLock(lock);
00869   patchesCounter -= rankPatches[CkMyRank()].size();
00870   if (patchesCounter == 0) {
00871     patchesCounter = getNumPatches();
00872     done = true;
00873   }
00874   CmiUnlock(lock);
00875   if (done) {
00876     computeMgr->sendLaunchWork(masterPe, this);
00877   }
00878 }
00879 
00880 int CudaComputeNonbonded::noWork() {
00881   // Simply enqueue doWork on masterPe and return "no work"
00882   computeMgr->sendMessageEnqueueWork(masterPe, this);
00883   return 1;
00884 }
00885 
00886 void CudaComputeNonbonded::reallocateArrays() {
00887   cudaCheck(cudaSetDevice(deviceID));
00888   SimParameters *simParams = Node::Object()->simParameters;
00889 
00890   // Re-allocate atoms
00891   reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
00892 
00893   // Re-allocate forces
00894   if (doStreaming) {
00895     reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
00896     reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f, cudaHostAllocMapped);
00897   } else {
00898     reallocate_host<float4>(&h_forces, &h_forcesSize, atomStorageSize, 1.4f);
00899     reallocate_host<float4>(&h_forcesSlow, &h_forcesSlowSize, atomStorageSize, 1.4f);
00900   }
00901   reallocate_device<float4>(&d_forces, &d_forcesSize, atomStorageSize, 1.4f);
00902   reallocate_device<float4>(&d_forcesSlow, &d_forcesSlowSize, atomStorageSize, 1.4f);  
00903 
00904   if (simParams->GBISOn) {
00905     reallocate_host<float>(&intRad0H, &intRad0HSize, atomStorageSize, 1.2f);
00906     reallocate_host<float>(&intRadSH, &intRadSHSize, atomStorageSize, 1.2f);
00907     reallocate_host<GBReal>(&psiSumH, &psiSumHSize, atomStorageSize, 1.2f);
00908     reallocate_host<GBReal>(&dEdaSumH, &dEdaSumHSize, atomStorageSize, 1.2f);
00909     reallocate_host<float>(&bornRadH, &bornRadHSize, atomStorageSize, 1.2f);
00910     reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixHSize, atomStorageSize, 1.2f);
00911   }
00912 }
00913 
00914 void CudaComputeNonbonded::doWork() {
00915   if (CkMyPe() != masterPe)
00916     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00917 
00918   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00919   // atomsChangedIn can be set to true by any Pe
00920   // atomsChanged can only be set by masterPe
00921   // Using two separate variables avoids a race condition.
00922   atomsChanged = atomsChangedIn;
00923   atomsChangedIn = false;
00924 
00925   SimParameters *simParams = Node::Object()->simParameters;
00926 
00927   if (patches.size() == 0) return;  // No work to do
00928 
00929   // Take the flags from the first patch
00930   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00931   Flags &flags = patches[0].patch->flags;
00932 
00933   doSlow = flags.doFullElectrostatics;
00934   doEnergy = flags.doEnergy;
00935   doVirial = flags.doVirial;
00936 
00937   if (flags.doNonbonded) {
00938 
00939     if (simParams->GBISOn) {
00940       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00941     }
00942 
00943     if (!simParams->GBISOn || gbisPhase == 1) {
00944       if ( computesChanged ) {
00945         updateComputes();
00946       }
00947       if (atomsChanged) {
00948         // Re-calculate patch atom numbers and storage
00949         updatePatches();
00950         reSortDone = false;
00951       }
00952       reallocateArrays();
00953     }
00954 
00955     // Open boxes on the Pes; the last Pe to finish triggers launchWork() on masterPe
00956     computeMgr->sendOpenBoxesOnPe(pes, this);
00957 
00958   } else {
00959     // No work to do, skip
00960     skip();
00961   }
00962 
00963 }
00964 
00965 void CudaComputeNonbonded::launchWork() {
00966   if (CkMyPe() != masterPe)
00967     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00968 
00969   beforeForceCompute = CkWallTimer();
00970 
00971   cudaCheck(cudaSetDevice(deviceID));
00972   SimParameters *simParams = Node::Object()->simParameters;
00973 
00974   //execute only during GBIS phase 1, or if not using GBIS
00975   if (!simParams->GBISOn || gbisPhase == 1) {
00976 
00977     if ( atomsChanged || computesChanged ) {
00978       // Invalidate pair lists
00979       pairlistsValid = false;
00980       pairlistTolerance = 0.0f;
00981     }
00982 
00983     // Get maximum atom movement and patch tolerance
00984     float maxAtomMovement = 0.0f;
00985     float maxPatchTolerance = 0.0f;
00986     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00987     // Update pair-list cutoff
00988     Flags &flags = patches[0].patch->flags;
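          // Pairlist bookkeeping: when flags.savePairlists is set, tile lists are rebuilt
          // with an extended cutoff (cutoff + 2*maxPatchTolerance) and marked valid; on later
          // steps they are reused only while no atom has moved more than half the stored
          // tolerance, otherwise a pairlist warning is counted and pairlists are not used.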
00989     savePairlists = false;
00990     usePairlists = false;
00991     if ( flags.savePairlists ) {
00992       savePairlists = true;
00993       usePairlists = true;
00994     } else if ( flags.usePairlists ) {
00995       if ( ! pairlistsValid ||
00996            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
00997         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
00998       } else {
00999         usePairlists = true;
01000       }
01001     }
01002     if ( ! usePairlists ) {
01003       pairlistsValid = false;
01004     }
01005     float plcutoff = cutoff;
01006     if ( savePairlists ) {
01007       pairlistsValid = true;
01008       pairlistTolerance = 2. * maxPatchTolerance;
01009       plcutoff += pairlistTolerance;
01010     }
01011     plcutoff2 = plcutoff * plcutoff;
01012 
01013     // if (atomsChanged)
01014     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01015     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01016 
01017   } // if (!simParams->GBISOn || gbisPhase == 1)
01018 
01019   // Calculate PME & VdW forces
01020   if (!simParams->GBISOn || gbisPhase == 1) {
01021     doForce();
01022     if (doStreaming) {
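            // In streaming mode the force kernel fills patchReadyQueue with the indices of
            // patches whose forces are complete; forceDoneCheck() polls this queue and
            // finishes each patch as soon as its forces are available.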
01023       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01024       patchReadyQueueLen = tileListKernel.getNumPatches();
01025       patchReadyQueueNext = 0;
01026       // Fill in empty patches [0 ... numEmptyPatches-1] at the top
01027       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01028       int* emptyPatches = tileListKernel.getEmptyPatches();
01029       for (int i=0;i < numEmptyPatches;i++) {
01030         patchReadyQueue[i] = emptyPatches[i];
01031       }
01032       if (patchReadyQueueLen != patches.size())
01033         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01034     }
01035   }
01036 
01037   // For GBIS phase 1 at a pairlist update, the tile lists must be re-sorted
01038   // before calling doGBISphase1().
01039   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01040     // In this code path doGBISphase1() is called in forceDone()
01041     forceDoneSetCallback();
01042     return;
01043   }
01044 
01045   // GBIS Phases
01046   if (simParams->GBISOn) {
01047     if (gbisPhase == 1) {
01048       doGBISphase1();
01049     } else if (gbisPhase == 2) {
01050       doGBISphase2(); 
01051     } else if (gbisPhase == 3) {
01052       doGBISphase3(); 
01053     }
01054   }
01055 
01056   // Copy forces to host
01057   if (!simParams->GBISOn || gbisPhase == 3) {
01058     if (!doStreaming) {
01059       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01060       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01061     }
01062   }
01063 
01064   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01065     // For GBIS, energies are ready after phase 2
01066     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01067       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01068       d_forces, d_forcesSlow, d_virialEnergy, stream);
01069     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01070   }
01071 
01072   // Setup call back
01073   forceDoneSetCallback();
01074 }
01075 
01076 //
01077 // GBIS Phase 1
01078 //
01079 void CudaComputeNonbonded::doGBISphase1() {
01080   cudaCheck(cudaSetDevice(deviceID));
01081   
01082   if (atomsChanged) {
01083     GBISKernel.updateIntRad(atomStorageSize, intRad0H, intRadSH, stream);
01084   }
01085 
01086   SimParameters *simParams = Node::Object()->simParameters;
01087   Lattice lattice = patches[0].patch->flags.lattice;
01088 
01089   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01090   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01091   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01092 
01093   GBISKernel.GBISphase1(tileListKernel, atomStorageSize,
01094     lata, latb, latc,
01095     simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream);
01096 }
01097 
01098 //
01099 // GBIS Phase 2
01100 //
01101 void CudaComputeNonbonded::doGBISphase2() {
01102   cudaCheck(cudaSetDevice(deviceID));
01103 
01104   SimParameters *simParams = Node::Object()->simParameters;
01105   Lattice lattice = patches[0].patch->flags.lattice;
01106 
01107   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01108   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01109   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01110 
01111   GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream);
01112 
01113   GBISKernel.GBISphase2(tileListKernel, atomStorageSize,
01114     doEnergy, doSlow,
01115     lata, latb, latc,
01116     simParams->cutoff, simParams->nonbondedScaling, simParams->kappa,
01117     (simParams->switchingActive ? simParams->switchingDist : -1.0),
01118     simParams->dielectric, simParams->solvent_dielectric,
01119     d_forces, dEdaSumH, stream);
01120 }
01121 
01122 //
01123 // GBIS Phase 3
01124 //
01125 void CudaComputeNonbonded::doGBISphase3() {
01126   cudaCheck(cudaSetDevice(deviceID));
01127   SimParameters *simParams = Node::Object()->simParameters;
01128   Lattice lattice = patches[0].patch->flags.lattice;
01129 
01130   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01131   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01132   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
01133 
01134   if (doSlow) {
01135     GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream);
01136 
01137     GBISKernel.GBISphase3(tileListKernel, atomStorageSize,
01138       lata, latb, latc,
01139       simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream);
01140   }
01141 }
01142 
01143 //
01144 // Calculate electrostatic & VdW forces
01145 //
01146 void CudaComputeNonbonded::doForce() {
01147   cudaCheck(cudaSetDevice(deviceID));
01148 
01149   Lattice lattice = patches[0].patch->flags.lattice;
01150   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
01151   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
01152   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
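        // Tile lists are rebuilt and sorted only when atoms have changed; the force kernel
        // below then refines them (see the atomsChanged flag), and the refined lists are
        // reused on subsequent steps until the next atom migration.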
01153 
01154   if (atomsChanged) {
01155     int numTileLists = calcNumTileLists();
01156     // Build initial tile lists and sort
01157     tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize,
01158       maxTileListLen, lata, latb, latc,
01159       cudaPatches, (const float4*)atoms, plcutoff2, stream);
01160     // Prepare tile list for atom-based refinement
01161     tileListKernel.prepareTileList(stream);
01162   }
01163 
01164   if (atomsChanged) {
01165     // Update Vdw types and exclusion index & maxdiff
01166     updateVdwTypesExcl();
01167   }
01168 
01169   beforeForceCompute = CkWallTimer();
01170 
01171   // Calculate forces (and refine tile list if atomsChanged=true)
01172   nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, atomsChanged,
01173     doEnergy, doVirial, doSlow, lata, latb, latc,
01174     (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow,
01175     stream);
01176 
01177   if (atomsChanged) {
01178     tileListKernel.finishTileList(stream);
01179   }
01180 
01181   traceUserBracketEvent(CUDA_DEBUG_EVENT, beforeForceCompute, CkWallTimer());
01182 }
01183 
01184 //
01185 // Compute an upper-bound estimate of the number of tile lists
01185 // (one list per WARPSIZE-wide tile of the first patch of each registered compute)
01186 //
01187 int CudaComputeNonbonded::calcNumTileLists() {
01188   int numTileLists = 0;
01189   for (int i=0;i < computes.size();i++) {
01190     int pi1 = computes[i].patchInd[0];
01191     int numAtoms1 = patches[pi1].numAtoms;
01192     int numTiles1 = (numAtoms1-1)/WARPSIZE+1;
01193     numTileLists += numTiles1;
01194   }
01195   return numTileLists;
01196 }
01197 
01198 //
01199 // Finish & submit reductions
01200 //
01201 void CudaComputeNonbonded::finishReductions() {
01202 
01203   if (CkMyPe() != masterPe)
01204     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01205   
01206   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01207 
01208   if (!doSkip) {
01209 
01210     if (doStreaming && (doVirial || doEnergy)) {
01211       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01212       if (!forceDoneEventRecord)
01213         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01214       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01215       forceDoneEventRecord = false;
01216     }
01217 
01218     if (doVirial) {
01219       Tensor virialTensor;
01220       virialTensor.xx = h_virialEnergy->virial[0];
01221       virialTensor.xy = h_virialEnergy->virial[1];
01222       virialTensor.xz = h_virialEnergy->virial[2];
01223       virialTensor.yx = h_virialEnergy->virial[3];
01224       virialTensor.yy = h_virialEnergy->virial[4];
01225       virialTensor.yz = h_virialEnergy->virial[5];
01226       virialTensor.zx = h_virialEnergy->virial[6];
01227       virialTensor.zy = h_virialEnergy->virial[7];
01228       virialTensor.zz = h_virialEnergy->virial[8];
01229       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01230       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01231       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01232       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01233       if (doSlow) {
01234         Tensor virialTensor;
01235         virialTensor.xx = h_virialEnergy->virialSlow[0];
01236         virialTensor.xy = h_virialEnergy->virialSlow[1];
01237         virialTensor.xz = h_virialEnergy->virialSlow[2];
01238         virialTensor.yx = h_virialEnergy->virialSlow[3];
01239         virialTensor.yy = h_virialEnergy->virialSlow[4];
01240         virialTensor.yz = h_virialEnergy->virialSlow[5];
01241         virialTensor.zx = h_virialEnergy->virialSlow[6];
01242         virialTensor.zy = h_virialEnergy->virialSlow[7];
01243         virialTensor.zz = h_virialEnergy->virialSlow[8];
01244         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01245         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01246         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01247         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01248       }
01249     }
01250     if (doEnergy) {
01251       // if (doSlow)
01252       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01253       SimParameters *simParams = Node::Object()->simParameters;
01254       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01255       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01256       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01257       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01258       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01259     }
01260 
01261     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01262   }
01263   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01264   reduction->submit();
01265 
01266   // Reset flags
01267   doSkip = false;
01268   computesChanged = false;
01269 }
01270 
01271 //
01272 // Finish a single patch
01273 //
01274 void CudaComputeNonbonded::finishPatch(int i) {
01275   if (CkMyPe() != patches[i].pe)
01276     NAMD_bug("CudaComputeNonbonded::finishPatch called on wrong Pe");
01277 
01278   PatchRecord &pr = patches[i];
01279   pr.results = pr.forceBox->open();
01280 
01281   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
01282   int atomStart = pr.atomStart;
01283   int numAtoms = pr.numAtoms;
01284   if (numAtoms > 0) {
01285     Force *f      = pr.results->f[Results::nbond];
01286     Force *f_slow = pr.results->f[Results::slow];
01287     float4 *af      = h_forces + atomStart;
01288     float4 *af_slow = h_forcesSlow + atomStart;
01289     // float maxf = 0.0f;
01290     // int maxf_k;
01291     for ( int k=0; k<numAtoms; ++k ) {
01292       int j = aExt[k].sortOrder;
01293       f[j].x += af[k].x;
01294       f[j].y += af[k].y;
01295       f[j].z += af[k].z;
01296       // if (maxf < fabsf(af[k].x) || maxf < fabsf(af[k].y) || maxf < fabsf(af[k].z)) {
01297       //   maxf = std::max(maxf, fabsf(af[k].x));
01298       //   maxf = std::max(maxf, fabsf(af[k].y));
01299       //   maxf = std::max(maxf, fabsf(af[k].z));
01300       //   maxf_k = k;
01301       // }
01302       if ( doSlow ) {
01303         f_slow[j].x += af_slow[k].x;
01304         f_slow[j].y += af_slow[k].y;
01305         f_slow[j].z += af_slow[k].z;
01306       }
01307     }
01308     // if (maxf > 10000.0f) {
01309     //   fprintf(stderr, "%d %f %f %f\n", maxf_k, af[maxf_k].x, af[maxf_k].y, af[maxf_k].z);
01310     //   cudaCheck(cudaStreamSynchronize(stream));
01311     //   NAMD_die("maxf!");
01312     // }
01313   }
01314 
01315   pr.positionBox->close(&(pr.compAtom));
01316   pr.forceBox->close(&(pr.results));
01317 }
01318 
01319 //
01320 // Finish a set of patches on this pe
01321 //
01322 void CudaComputeNonbonded::finishSetOfPatchesOnPe(std::vector<int>& patchSet) {
01323   if (patchSet.size() == 0)
01324     NAMD_bug("CudaComputeNonbonded::finishPatchesOnPe, empty rank");
01325   SimParameters *simParams = Node::Object()->simParameters;
01326   // Save value of gbisPhase here because it can change after the last finishGBISPhase() or finishPatch() is called
01327   int gbisPhaseSave = gbisPhase;
01328   // Close Boxes depending on Phase
01329   if (simParams->GBISOn) {
01330     for (int i=0;i < patchSet.size();i++) {
01331       finishGBISPhase(patchSet[i]);
01332     }
01333   }
01334   // Finish patches
01335   if (!simParams->GBISOn || gbisPhaseSave == 3) {
01336     for (int i=0;i < patchSet.size();i++) {
01337       finishPatch(patchSet[i]);
01338     }
01339   }
01340   bool done = false;
01341   CmiLock(lock);
01342   patchesCounter -= patchSet.size();
01343   if (patchesCounter == 0) {
01344     patchesCounter = getNumPatches();
01345     done = true;
01346   }
01347   CmiUnlock(lock);
01348   if (done) {
01349     // Do reductions
01350     if (!simParams->GBISOn || gbisPhaseSave == 3) {
01351       // Reduction must be done on masterPe
01352       computeMgr->sendFinishReductions(masterPe, this);
01353     }
01354   }
01355 }
01356 
01357 //
01358 // Finish all patches that are on this pe
01359 //
01360 void CudaComputeNonbonded::finishPatchesOnPe() {
01361   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01362 }
01363 
01364 //
01365 // Finish single patch on this pe
01366 //
01367 void CudaComputeNonbonded::finishPatchOnPe(int i) {
01368   std::vector<int> v(1, i);
01369   finishSetOfPatchesOnPe(v);
01370 }
01371 
01372 void CudaComputeNonbonded::finishPatches() {
01373   computeMgr->sendFinishPatchesOnPe(pes, this);
01374 }
01375 
01376 void CudaComputeNonbonded::finishGBISPhase(int i) {
01377   if (CkMyPe() != patches[i].pe)
01378     NAMD_bug("CudaComputeNonbonded::finishGBISPhase called on wrong Pe");
01379   PatchRecord &pr = patches[i];
01380   const CompAtomExt *aExt = pr.patch->getCompAtomExtInfo();
01381   int atomStart = pr.atomStart;
01382   if (gbisPhase == 1) {
01383     GBReal *psiSumMaster = psiSumH + atomStart;
01384     for ( int k=0; k<pr.numAtoms; ++k ) {
01385       int j = aExt[k].sortOrder;
01386       pr.psiSum[j] += psiSumMaster[k];
01387     }
01388     pr.psiSumBox->close(&(pr.psiSum));
01389   } else if (gbisPhase == 2) {
01390     GBReal *dEdaSumMaster = dEdaSumH + atomStart;
01391     for ( int k=0; k<pr.numAtoms; ++k ) {
01392       int j = aExt[k].sortOrder;
01393       pr.dEdaSum[j] += dEdaSumMaster[k];
01394     }
01395     pr.dEdaSumBox->close(&(pr.dEdaSum));
01396   } else if (gbisPhase == 3) {
01397     pr.intRadBox->close(&(pr.intRad)); //box 6
01398     pr.bornRadBox->close(&(pr.bornRad)); //box 7
01399     pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01400   } //end phases
01401 }
01402 
01403 void CudaComputeNonbonded::finishTimers() {
01404   SimParameters *simParams = Node::Object()->simParameters;
01405 
01406   if (simParams->GBISOn) {
01407     if (gbisPhase == 1)
01408       traceUserBracketEvent(CUDA_GBIS1_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
01409     if (gbisPhase == 2)
01410       traceUserBracketEvent(CUDA_GBIS2_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
01411     if (gbisPhase == 3)
01412       traceUserBracketEvent(CUDA_GBIS3_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
01413   } else {
01414     traceUserBracketEvent(CUDA_NONBONDED_KERNEL_EVENT, beforeForceCompute, CkWallTimer());
01415   }  
01416 }
01417 
01418 //
01419 // Re-sort tile lists if necessary
01420 //
01421 void CudaComputeNonbonded::reSortTileLists() {
01422   // Re-sort tile lists
01423   SimParameters *simParams = Node::Object()->simParameters;
01424   cudaCheck(cudaSetDevice(deviceID));
01425   tileListKernel.reSortTileLists(simParams->GBISOn, stream);
01426 }
01427 
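      //
      // Polls for completion of the GPU work launched from launchWork(). Runs on masterPe
      // via a Charm++ timed callback (re-armed every 0.1 ms) until the streaming patch-ready
      // queue has drained or forceDoneEvent has fired, then triggers patch finishing.
      //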
01428 void CudaComputeNonbonded::forceDoneCheck(void *arg, double walltime) {
01429   CudaComputeNonbonded* c = (CudaComputeNonbonded *)arg;
01430 
01431   if (CkMyPe() != c->masterPe)
01432     NAMD_bug("CudaComputeNonbonded::forceDoneCheck called on non masterPe");
01433 
01434   SimParameters *simParams = Node::Object()->simParameters;
01435   cudaCheck(cudaSetDevice(c->deviceID));
01436 
01437   if (c->doStreaming) {
01438     int patchInd;
01439     while ( -1 != (patchInd = c->patchReadyQueue[c->patchReadyQueueNext]) ) {
01440       c->patchReadyQueue[c->patchReadyQueueNext] = -1;
01441       c->patchReadyQueueNext++;
01442       c->checkCount = 0;
01443 
01444       if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) {
01445         c->finishTimers();
01446         if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
01447           c->reSortTileLists();
01448           c->reSortDone = true;
01449           if (simParams->GBISOn && c->gbisPhase == 1) {
01450             // We must do GBIS Phase 1
01451             c->doGBISphase1();
01452             c->forceDoneSetCallback();
01453             return;
01454           }
01455         }
01456       }
01457 
01458       // Finish patch
01459       int pe = c->patches[patchInd].pe;
01460       PatchID patchID = c->patches[patchInd].patchID;  // for priority
01461       c->computeMgr->sendFinishPatchOnPe(pe, c, patchInd, patchID);
01462 
01463       // Last patch, return
01464       if ( c->patchReadyQueueNext == c->patchReadyQueueLen ) return;
01465 
01466     }
01467   } else {
01468     if (!c->forceDoneEventRecord)
01469       NAMD_bug("CudaComputeNonbonded::forceDoneCheck, forceDoneEvent not being recorded");
01470     cudaError_t err = cudaEventQuery(c->forceDoneEvent);
01471     if (err == cudaSuccess) {
01472       // Event has occurred
01473       c->forceDoneEventRecord = false;
01474       c->checkCount = 0;
01475       c->finishTimers();
01476       if (c->atomsChanged && (!simParams->GBISOn || c->gbisPhase == 1) && !c->reSortDone) {
01477         c->reSortTileLists();
01478         c->reSortDone = true;
01479         if (simParams->GBISOn && c->gbisPhase == 1) {
01480           // We must do GBIS Phase 1
01481           c->doGBISphase1();
01482           c->forceDoneSetCallback();
01483           return;
01484         }
01485       }
01486       c->finishPatches();
01487       return;
01488     } else if (err != cudaErrorNotReady) {
01489       // Anything else is an error
01490       cudaCheck(err);
01491       // NAMD_bug("CudaComputeNonbonded::forceDoneCheck, cudaEventQuery returned error");
01492     }
01493   }
01494 
01495   // if (c->checkCount % 1000 == 0)
01496   //   fprintf(stderr, "c->patchReadyQueueNext %d\n", c->patchReadyQueueNext);
01497 
01498   // Event has not occurred
01499   c->checkCount++;
01500   if (c->checkCount >= 1000000) {
01501     NAMD_bug("CudaComputeNonbonded::forceDoneCheck, check count exceeded");
01502   }
01503 
01504   // Call again 
01505   CcdCallBacksReset(0, walltime);
01506   CcdCallFnAfter(forceDoneCheck, arg, 0.1);
01507 }
01508 
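//
// A minimal sketch of the polling pattern used in forceDoneCheck() above, with a
// hypothetical callback name: query the CUDA event instead of blocking on it, and
// if the GPU has not reached the event yet, re-arm a Converse timer callback so
// the PE stays responsive and re-checks roughly 0.1 ms later.
//
static void pollEventSketch(void *arg, double walltime) {
  cudaEvent_t event = *(cudaEvent_t *)arg;
  cudaError_t err = cudaEventQuery(event);
  if (err == cudaSuccess) {
    // GPU work preceding the event has finished; hand off to the next stage here
    return;
  }
  if (err != cudaErrorNotReady) cudaCheck(err);  // any other code is a real error
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(pollEventSketch, arg, 0.1);     // try again in ~0.1 ms
}
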
01509 //
01510 // Set callback for all the work in the stream at this point
01511 //
01512 void CudaComputeNonbonded::forceDoneSetCallback() {
01513   if (CkMyPe() != masterPe)
01514     NAMD_bug("CudaComputeNonbonded::forceDoneSetCallback called on non masterPe");
01515   beforeForceCompute = CkWallTimer();
01516   cudaCheck(cudaSetDevice(deviceID));
01517   if (!doStreaming || doVirial || doEnergy) {
01518     cudaCheck(cudaEventRecord(forceDoneEvent, stream));
01519     forceDoneEventRecord = true;
01520   }
01521   checkCount = 0;
01522   CcdCallBacksReset(0, CmiWallTimer());
01523   // Set the callback to fire after 0.1 ms
01524   CcdCallFnAfter(forceDoneCheck, this, 0.1);
01525 }
01526 
01527 struct cr_sortop_distance {
01528   const Lattice &l;
01529   cr_sortop_distance(const Lattice &lattice) : l(lattice) { }
01530   bool operator() (CudaComputeNonbonded::ComputeRecord i,
01531       CudaComputeNonbonded::ComputeRecord j) {
01532     Vector a = l.a();
01533     Vector b = l.b();
01534     Vector c = l.c();
01535     BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
01536     BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
01537     return ( ri < rj );
01538   }
01539 };
01540 
01541 static inline bool sortop_bitreverse(int a, int b) {
01542   if ( a == b ) return 0; 
01543   for ( int bit = 1; bit; bit *= 2 ) {
01544     if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
01545   }
01546   return 0;
01547 }
01548 
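//
// Worked example for sortop_bitreverse() above: it compares two ranks by the
// first bit (starting from the lowest) in which they differ, i.e. it orders ints
// by their bit-reversed value. A minimal sketch with a hypothetical function name:
//
static void bitreverseOrderExampleSketch() {
  int ranks[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  std::sort(ranks, ranks + 8, sortop_bitreverse);
  // ranks is now {0, 4, 2, 6, 1, 5, 3, 7}
}
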
01549 struct cr_sortop_reverse_priority {
01550   cr_sortop_distance &distop;
01551   const CudaComputeNonbonded::PatchRecord *pr;
01552   cr_sortop_reverse_priority(cr_sortop_distance &sod,
01553        const CudaComputeNonbonded::PatchRecord *patchrecs) : distop(sod), pr(patchrecs) { }
01554   bool pid_compare_priority(int2 pidi, int2 pidj) {
01555     const CudaComputeNonbonded::PatchRecord &pri = pr[pidi.y];
01556     const CudaComputeNonbonded::PatchRecord &prj = pr[pidj.y];
01557     if ( pri.isSamePhysicalNode && ! prj.isSamePhysicalNode ) return 0;
01558     if ( prj.isSamePhysicalNode && ! pri.isSamePhysicalNode ) return 1;
01559     if ( pri.isSameNode && ! prj.isSameNode ) return 0;
01560     if ( prj.isSameNode && ! pri.isSameNode ) return 1;
01561     if ( pri.isSameNode ) {  // and prj.isSameNode
01562       int rpri = pri.reversePriorityRankInPe;
01563       int rprj = prj.reversePriorityRankInPe;
01564       if ( rpri != rprj ) return rpri > rprj;
01565       return sortop_bitreverse(CkRankOf(pri.pe),CkRankOf(prj.pe));
01566     }
01567     int ppi = PATCH_PRIORITY(pidi.x);
01568     int ppj = PATCH_PRIORITY(pidj.x);
01569     if ( ppi != ppj ) return ppi < ppj;
01570     return pidi.x < pidj.x;
01571   }
01572   bool operator() (CudaComputeNonbonded::ComputeRecord j,
01573       CudaComputeNonbonded::ComputeRecord i) {  // i and j reversed
01574     // Choose patch i (= patch with greater priority)
01575     int2 pidi = pid_compare_priority(make_int2(i.pid[0], i.patchInd[0]), make_int2(i.pid[1], i.patchInd[1])) ? make_int2(i.pid[0], i.patchInd[0]) : make_int2(i.pid[1], i.patchInd[1]);
01576     // Choose patch j
01577     int2 pidj = pid_compare_priority(make_int2(j.pid[0], j.patchInd[0]), make_int2(j.pid[1], j.patchInd[1])) ? make_int2(j.pid[0], j.patchInd[0]) : make_int2(j.pid[1], j.patchInd[1]);
01578     if ( pidi.x != pidj.x ) return pid_compare_priority(pidi, pidj);
01579     return distop(i,j);
01580   }
01581 };
01582 
01583 //
01584 // Set up computes. This is done only at startup and at load balancing, so the
01585 // performance of the CPU->GPU memory copy here is not a concern.
01586 //
01587 void CudaComputeNonbonded::updateComputes() {
01588   cudaCheck(cudaSetDevice(deviceID));
01589 
01590   Lattice lattice = patches[0].patch->flags.lattice;
01591   cr_sortop_distance so(lattice);
01592   std::stable_sort(computes.begin(), computes.end(), so);
01593 
01594   if (doStreaming) {
01595     cr_sortop_reverse_priority sorp(so, patches.data());
01596     std::stable_sort(computes.begin(), computes.end(), sorp);
01597   }
01598 
01599   CudaComputeRecord* cudaComputes = new CudaComputeRecord[computes.size()];
01600 
01601   for (int i=0;i < computes.size();i++) {
01602     cudaComputes[i].patchInd.x = computes[i].patchInd[0];
01603     cudaComputes[i].patchInd.y = computes[i].patchInd[1];
01604     cudaComputes[i].offsetXYZ.x = computes[i].offset.x;
01605     cudaComputes[i].offsetXYZ.y = computes[i].offset.y;
01606     cudaComputes[i].offsetXYZ.z = computes[i].offset.z;
01607   }
01608 
01609   tileListKernel.updateComputes(computes.size(), cudaComputes, stream);
01610   cudaCheck(cudaStreamSynchronize(stream));
01611 
01612   delete [] cudaComputes;
01613 }
01614 
01615 struct exlist_sortop {
01616   bool operator() (int32 *li, int32 *lj) {
01617     return ( li[1] < lj[1] );
01618   }
01619 };
01620 
01621 //
01622 // Builds the exclusions table. Swiped from ComputeNonbondedCUDA.C
01623 //
01624 void CudaComputeNonbonded::buildExclusions() {
01625   cudaCheck(cudaSetDevice(deviceID));
01626 
01627   Molecule *mol = Node::Object()->molecule;
01628 
01629 #ifdef MEM_OPT_VERSION
01630   int natoms = mol->exclSigPoolSize;
01631 #else
01632   int natoms = mol->numAtoms; 
01633 #endif
01634 
01635   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
01636   exclusionsByAtom = new int2[natoms];
01637 
01638   // create unique sorted lists
01639 
01640   ObjectArena<int32> listArena;
01641   ResizeArray<int32*> unique_lists;
01642   int32 **listsByAtom = new int32*[natoms];
01643   SortableResizeArray<int32> curList;
01644   for ( int i=0; i<natoms; ++i ) {
01645     curList.resize(0);
01646     curList.add(0);  // always excluded from self
01647 #ifdef MEM_OPT_VERSION
01648     const ExclusionSignature *sig = mol->exclSigPool + i;
01649     int n = sig->fullExclCnt;
01650     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
01651     n += 1;
01652 #else
01653     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
01654     int n = mol_list[0] + 1;
01655     for ( int j=1; j<n; ++j ) {
01656       curList.add(mol_list[j] - i);
01657     }
01658 #endif
01659     curList.sort();
01660 
01661     int j;
01662     for ( j=0; j<unique_lists.size(); ++j ) {
01663       if ( n != unique_lists[j][0] ) continue;  // no match
01664       int k;
01665       for ( k=0; k<n; ++k ) {
01666         if ( unique_lists[j][k+3] != curList[k] ) break;
01667       }
01668       if ( k == n ) break;  // found match
01669     }
01670     if ( j == unique_lists.size() ) {  // no match
01671       int32 *list = listArena.getNewArray(n+3);
01672       list[0] = n;
01673       int maxdiff = 0;
01674       maxdiff = -1 * curList[0];
01675       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
01676       list[1] = maxdiff;
01677       for ( int k=0; k<n; ++k ) {
01678         list[k+3] = curList[k];
01679       }
01680       unique_lists.add(list);
01681     }
01682     listsByAtom[i] = unique_lists[j];
01683   }
01684   // sort lists by maxdiff
01685   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
01686   long int totalbits = 0;
01687   int nlists = unique_lists.size();
01688   for ( int j=0; j<nlists; ++j ) {
01689     int32 *list = unique_lists[j];
01690     int maxdiff = list[1];
01691     list[2] = totalbits + maxdiff;
01692     totalbits += 2*maxdiff + 1;
01693   }
01694   for ( int i=0; i<natoms; ++i ) {
01695     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
01696     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
01697   }
01698   delete [] listsByAtom;
01699 
01700   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
01701 
01702   {
01703     long int bytesneeded = totalbits / 8;
01704     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
01705       CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
01706                unique_lists.size(), bytesneeded);
01707     }
01708 
01709     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
01710     if ( bytesneeded > bytesavail ) {
01711       char errmsg[512];
01712       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
01713                      "but only %ld bytes can be addressed with 32-bit int.",
01714                      unique_lists.size(), bytesneeded, bytesavail);
01715       NAMD_die(errmsg);
01716     }
01717   }
01718 
01719 #define SET_EXCL(EXCL,BASE,DIFF) \
01720          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
01721 
01722   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
01723   memset(exclusion_bits, 0, totalbits/8);
01724 
01725   long int base = 0;
01726   for ( int i=0; i<unique_lists.size(); ++i ) {
01727     base += unique_lists[i][1];
01728     if ( unique_lists[i][2] != (int32)base ) {
01729       NAMD_bug("CudaComputeNonbonded::build_exclusions base != stored");
01730     }
01731     int n = unique_lists[i][0];
01732     for ( int j=0; j<n; ++j ) {
01733       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
01734     }
01735     base += unique_lists[i][1] + 1;
01736   }
01737 
01738   int numExclusions = totalbits/32;
01739 
01740   nonbondedKernel.bindExclusions(numExclusions, exclusion_bits);
01741 
01742   delete [] exclusion_bits;
01743 }
01744 
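//
// A minimal sketch (hypothetical helper, not part of NAMD) of how the bits written
// by SET_EXCL above can be read back: atom i owns a window of 2*maxdiff+1 bits
// centred on its start index, so the bit for a partner atom j sits at offset
// (j - i) from that start. exclEntry is the atom's {maxdiff, start} pair, i.e.
// the exclusionsByAtom[i] value built in buildExclusions().
//
static inline bool isExcludedSketch(const unsigned int *exclusionBits,
                                    int2 exclEntry, int i, int j) {
  int diff = j - i;
  if (diff < -exclEntry.x || diff > exclEntry.x) return false;  // outside the window
  long int bit = (long int)exclEntry.y + diff;                  // start + diff
  return ((exclusionBits[bit >> 5] >> (bit & 31)) & 1) != 0;
}
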
01745 #endif // NAMD_CUDA
