ComputeBondedCUDA.C

#include "charm++.h"
#include "NamdTypes.h"
#include "ComputeMgr.h"
#include "WorkDistrib.h"
#include "ProxyMgr.h"
#include "CudaUtils.h"
#include "DeviceCUDA.h"
#include "ComputeBonds.h"
#include "ComputeAngles.h"
#include "ComputeDihedrals.h"
#include "ComputeImpropers.h"
#include "ComputeCrossterms.h"
#include "ComputeNonbondedCUDAExcl.h"
#include "ComputeBondedCUDA.h"

#ifdef NAMD_CUDA
#ifdef BONDED_CUDA

#include <algorithm>   // std::find

const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
  sizeof(CudaBond),          // Bonds
  sizeof(CudaAngle),         // Angles
  sizeof(CudaDihedral),      // Dihedrals
  sizeof(CudaDihedral),      // Impropers
  sizeof(CudaExclusion),     // Exclusions
  sizeof(CudaCrossterm)      // Crossterms
};

extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++
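// NOTE: CcdCallBacksReset() is called before each CcdCallFnAfter() below (see
// forceDoneSetCallback() and forceDoneCheck()) so that Charm++'s conditional
// callback timers fire reliably while the GPU event is being polled.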

//
// Class constructor
//
ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
  CudaNonbondedTables& cudaNonbondedTables) :
Compute(c), computeMgr(computeMgr), deviceID(deviceID), masterPe(CkMyPe()),
bondedKernel(deviceID, cudaNonbondedTables)
{

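  // Per-rank bookkeeping: these containers are indexed by Charm++ rank within
  // this node, so each PE touches only its own slot and no locking is needed.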
  computes.resize(CkMyNodeSize());
  patchIDsPerRank.resize(CkMyNodeSize());
  numExclPerRank.resize(CkMyNodeSize());
  for (int i=0;i < numExclPerRank.size();i++) {
    numExclPerRank[i].numModifiedExclusions = 0;
    numExclPerRank[i].numExclusions = 0;
  }

  atomMap.allocateMap(Node::Object()->molecule->numAtoms);

  flags = NULL;

  tupleData = NULL;
  tupleDataSize = 0;

  atoms = NULL;
  atomsSize = 0;

  forces = NULL;
  forcesSize = 0;

  energies_virials = NULL;

  initializeCalled = false;

  SimParameters *params = Node::Object()->simParameters;
  accelMDdoDihe = false;
  if (params->accelMDOn) {
     if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
  }
}

//
// Class destructor
//
ComputeBondedCUDA::~ComputeBondedCUDA() {
  cudaCheck(cudaSetDevice(deviceID));

  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
  if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
  if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
  if (tupleData != NULL) deallocate_host<char>(&tupleData);

  if (initializeCalled) {
    cudaCheck(cudaStreamDestroy(stream));
    cudaCheck(cudaEventDestroy(forceDoneEvent));
    CmiDestroyLock(lock);
    delete reduction;
  }

  // NOTE: unregistering happens in the [sync] entry method
  computeMgr->sendUnregisterBoxesOnPe(pes, this);
}

void ComputeBondedCUDA::unregisterBoxesOnPe() {
  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
    if (tpe == NULL || tpe->p == NULL) {
      NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
    }
    Patch* patch = tpe->p;
    if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
    if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
    if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
  }
}

//
// Register compute for a given PE. pids is a list of patches the PE has
// Called by master PE
//
void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");

  int rank = CkRankOf(pe);

  HomeCompute& homeCompute = computes[rank].homeCompute;
  if (homeCompute.patchIDs.size() == 0) {
    homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
    homeCompute.patchIDs.resize(pids.size());
    for (int i=0;i < pids.size();i++) {
      homeCompute.patchIDs[i] = pids[i];
      homeCompute.isBasePatch[pids[i]] = 1;
    }
  } else {
    if (homeCompute.patchIDs.size() != pids.size()) {
      NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
    }
    for (int i=0;i < pids.size();i++) {
      if (homeCompute.patchIDs[i] != pids[i]) {
        NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
      }
    }
  }

  switch(type) {
    case computeBondsType:
    homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
    break;

    case computeAnglesType:
    homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
    break;

    case computeDihedralsType:
    homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
    break;

    case computeImpropersType:
    homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
    break;

    case computeExclsType:
    homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
    break;

    case computeCrosstermsType:
    homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
    break;

    default:
    NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
    break;
  }

}

//
// Register self compute for a given PE
// Called by master PE
//
void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");

  int rank = CkRankOf(pe);

  std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
  auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
  if (it == selfComputes.end()) {
    // Type not found, add new one
    selfComputes.push_back(SelfCompute(type));
    it = selfComputes.begin() + (selfComputes.size() - 1);

    switch(type) {
      case computeSelfBondsType:
      it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
      break;

      case computeSelfAnglesType:
      it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
      break;

      case computeSelfDihedralsType:
      it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
      break;

      case computeSelfImpropersType:
      it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
      break;

      case computeSelfExclsType:
      it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
      break;

      case computeSelfCrosstermsType:
      it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
      break;

      default:
      NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
      break;
    }

  }

  // Add patch ID for this type
  it->patchIDs.push_back(pid);
}

void ComputeBondedCUDA::assignPatchesOnPe() {

  PatchMap* patchMap = PatchMap::Object();
  for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
    PatchID patchID = patchIDsPerRank[CkMyRank()][i];
    ProxyMgr::Object()->createProxy(patchID);
    Patch* patch = patchMap->patch(patchID);
    if (patch == NULL)
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
    if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
    if (tpe == NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
    }
    if (tpe->p != NULL) {
      NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
    }
    // Assign patch and register coordinates and forces manually
    tpe->p = patch;
    tpe->positionBox = patch->registerPositionPickup(this);
    tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
    tpe->forceBox = patch->registerForceDeposit(this);
  }
}

//
// atomUpdate() can be called by any Pe
//
void ComputeBondedCUDA::atomUpdate() {
  atomsChangedIn = true;
}

//
// Enqueue doWork on masterPe and return "no work"
// Can be called by any Pe
//
int ComputeBondedCUDA::noWork() {
  computeMgr->sendMessageEnqueueWork(masterPe, this);
  return 1;
}

void ComputeBondedCUDA::messageEnqueueWork() {
  if (masterPe != CkMyPe())
    NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
  WorkDistrib::messageEnqueueWork(this);
}

//
// Sends open-box commands to PEs
// Called on master PE
//
void ComputeBondedCUDA::doWork() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");

  // Read the value of atomsChangedIn, which is set in atomUpdate(), and reset it.
  // atomsChangedIn can be set to true by any Pe
  // atomsChanged can only be set by masterPe
  // Using two separate variables avoids a race condition
  atomsChanged = atomsChangedIn;
  atomsChangedIn = false;

  if (getNumPatches() == 0) return;  // No work to do

  if (flags == NULL)
    NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");

  // Read flags
  lattice = flags->lattice;
  doEnergy = flags->doEnergy;
  doVirial = flags->doVirial;
  doSlow   = flags->doFullElectrostatics;
  doMolly  = flags->doMolly;

  if (atomsChanged) {
    // Re-calculate patch atom numbers and storage
    updatePatches();
  }

  // Open boxes on Pes and launch work to masterPe
  computeMgr->sendOpenBoxesOnPe(pes, this);
}

//
// This gets called when a patch finishes on a PE
//
void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
  if (doneMigration) {
    // auto it = patchIndex.find(pid);
    // if (it == patchIndex.end())
    //   NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
    patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
  }
  CmiLock(lock);
  Compute::patchReady(pid, doneMigration, seq);
  CmiUnlock(lock);
}

//
// Recompute patch atom offsets and total atom storage after migration
//
void ComputeBondedCUDA::updatePatches() {
  int atomStart = 0;
  for (int i=0;i < patches.size();i++) {
    patches[i].atomStart = atomStart;
    atomStart += patches[i].numAtoms;
  }
  atomStorageSize = atomStart;

  // Re-allocate atoms
  reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
}

//
// Map atoms GPU-wide
//
void ComputeBondedCUDA::mapAtoms() {

  for (int i=0;i < getNumPatches();i++) {
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
    atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
  }

}

//
// Unmap atoms GPU-wide
//
void ComputeBondedCUDA::unmapAtoms() {

  for (int i=0;i < getNumPatches();i++) {
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
    atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
  }

}

//
// Open all patches that have been assigned to this Pe
//
void ComputeBondedCUDA::openBoxesOnPe() {

  std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];

  for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
    PatchID patchID = *it;
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
    tpe->x = tpe->positionBox->open();
    tpe->xExt = tpe->p->getCompAtomExtInfo();
    if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
    tpe->r = tpe->forceBox->open();
    tpe->f = tpe->r->f[Results::normal];
    if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD

    // Copy atoms
    int pi = patchIndex[patchID];
    int atomStart = patches[pi].atomStart;
    int numAtoms = patches[pi].numAtoms;
    CompAtom* compAtom = tpe->x;
    const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
    const CudaAtom *src = tpe->p->getCudaAtomList();
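    // Scatter atoms into CUDA sort order: aExt[i].sortOrder gives home-order
    // atom i's slot within this patch's block of the staging buffer.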
    for (int i=0;i < numAtoms;i++) {
      int j = aExt[i].sortOrder;
      atoms[atomStart + j] = src[i];
    }

  }

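  // Node-level countdown: the last rank to finish opening its boxes resets
  // patchesCounter and triggers the next stage for the whole node.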
  bool done = false;
  CmiLock(lock);
  patchesCounter -= patchIDs.size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    done = true;
  }
  CmiUnlock(lock);
  if (done) {
    if (atomsChanged) {
      mapAtoms();
      computeMgr->sendLoadTuplesOnPe(pes, this);
    } else {
      computeMgr->sendLaunchWork(masterPe, this);
    }
  }
}

void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
  numModifiedExclusions = 0;
  int ntuples = tuples->getNumTuples();
  ExclElem* src = (ExclElem *)(tuples->getTupleList());
  for (int ituple=0;ituple < ntuples;ituple++) {
    if (src[ituple].modified) numModifiedExclusions++;
  }
  numExclusions = ntuples - numModifiedExclusions;
}

//
// Load tuples on PE. NOTE: this can run only after boxes on all PEs have been opened
//
void ComputeBondedCUDA::loadTuplesOnPe() {

  int numModifiedExclusions = 0;
  int numExclusions = 0;

  std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
  // Loop over self compute types
  for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
    it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
    // For exclusions, we must count the number of modified and non-modified exclusions
    if (it->tuples->getType() == Tuples::EXCLUSION) {
      int tmp1, tmp2;
      countNumExclusions(it->tuples, tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
    }
  }

  HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
  for (int i=0;i < homeCompute.tuples.size();i++) {
    homeCompute.tuples[i]->loadTuples(tuplePatchList,
      homeCompute.isBasePatch.data(), &atomMap,
      homeCompute.patchIDs);
    // For exclusions, we must count the number of modified and non-modified exclusions
    if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
      int tmp1, tmp2;
      countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
      numModifiedExclusions += tmp1;
      numExclusions += tmp2;
    }
  }

  // Store number of exclusions
  numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
  numExclPerRank[CkMyRank()].numExclusions         = numExclusions;

  bool done = false;
  CmiLock(lock);
  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    done = true;
  }
  CmiUnlock(lock);
  if (done) {
    computeMgr->sendLaunchWork(masterPe, this);
  }
}

void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
  const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {

  PatchMap* patchMap = PatchMap::Object();
  for (int ituple=0;ituple < ntuples;ituple++) {
    CudaBond dstval;
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.itype = (src[ituple].value - bond_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
    dstval.scale = src[ituple].scale;
    dst[ituple] = dstval;
  }
}

void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
  const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {

  PatchMap* patchMap = PatchMap::Object();
  for (int ituple=0;ituple < ntuples;ituple++) {
    CudaAngle dstval;
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.itype = (src[ituple].value - angle_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
    shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
    dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
    dstval.scale = src[ituple].scale;
    dst[ituple] = dstval;
  }
}

//
// Used for both dihedrals and impropers
//
template <bool doDihedral, typename T, typename P>
void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
  const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {

  PatchMap* patchMap = PatchMap::Object();
  for (int ituple=0;ituple < ntuples;ituple++) {
    CudaDihedral dstval;
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    dstval.i = l0 + patches[pi0].atomStart;
    dstval.j = l1 + patches[pi1].atomStart;
    dstval.k = l2 + patches[pi2].atomStart;
    dstval.l = l3 + patches[pi3].atomStart;
    if (doDihedral) {
      dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
    } else {
      dstval.itype = improperMultMap[(src[ituple].value - p_array)];
    }
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
    shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
    dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
    dstval.scale = src[ituple].scale;
    dst[ituple] = dstval;
  }
}

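//
// Copy exclusion data, splitting the tuples into two destination buffers:
// modified exclusions go to dst1, non-modified exclusions to dst2. pos and
// pos2 track the corresponding byte offsets within the tuple buffer.
//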
void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
  CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int& pos, int& pos2) {

  PatchMap* patchMap = PatchMap::Object();
  for (int ituple=0;ituple < ntuples;ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    CompAtom& ca1 = p0->x[l0];
    CompAtom& ca2 = p1->x[l1];
    Position position1 = ca1.position;
    Position position2 = ca2.position;
    Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
    shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    CudaExclusion ce;
    ce.i          = l0 + patches[pi0].atomStart;
    ce.j          = l1 + patches[pi1].atomStart;
    ce.vdwtypei   = ca1.vdwType;
    ce.vdwtypej   = ca2.vdwType;
    ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
    //
    if (src[ituple].modified) {
      *dst1 = ce;
      dst1++;
      pos += typeSize;
    } else {
      *dst2 = ce;
      dst2++;
      pos2 += typeSize;
    }
  }
}

void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
  const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {

  PatchMap* patchMap = PatchMap::Object();
  for (int ituple=0;ituple < ntuples;ituple++) {
    auto p0 = src[ituple].p[0];
    auto p1 = src[ituple].p[1];
    auto p2 = src[ituple].p[2];
    auto p3 = src[ituple].p[3];
    auto p4 = src[ituple].p[4];
    auto p5 = src[ituple].p[5];
    auto p6 = src[ituple].p[6];
    auto p7 = src[ituple].p[7];
    int pi0 = patchIndex[p0->patchID];
    int pi1 = patchIndex[p1->patchID];
    int pi2 = patchIndex[p2->patchID];
    int pi3 = patchIndex[p3->patchID];
    int pi4 = patchIndex[p4->patchID];
    int pi5 = patchIndex[p5->patchID];
    int pi6 = patchIndex[p6->patchID];
    int pi7 = patchIndex[p7->patchID];
    int l0 = src[ituple].localIndex[0];
    int l1 = src[ituple].localIndex[1];
    int l2 = src[ituple].localIndex[2];
    int l3 = src[ituple].localIndex[3];
    int l4 = src[ituple].localIndex[4];
    int l5 = src[ituple].localIndex[5];
    int l6 = src[ituple].localIndex[6];
    int l7 = src[ituple].localIndex[7];
    dst[ituple].i1 = l0 + patches[pi0].atomStart;
    dst[ituple].i2 = l1 + patches[pi1].atomStart;
    dst[ituple].i3 = l2 + patches[pi2].atomStart;
    dst[ituple].i4 = l3 + patches[pi3].atomStart;
    dst[ituple].i5 = l4 + patches[pi4].atomStart;
    dst[ituple].i6 = l5 + patches[pi5].atomStart;
    dst[ituple].i7 = l6 + patches[pi6].atomStart;
    dst[ituple].i8 = l7 + patches[pi7].atomStart;
    dst[ituple].itype = (src[ituple].value - crossterm_array);
    Position position1 = p0->x[l0].position;
    Position position2 = p1->x[l1].position;
    Position position3 = p2->x[l2].position;
    Position position4 = p3->x[l3].position;
    Position position5 = p4->x[l4].position;
    Position position6 = p5->x[l5].position;
    Position position7 = p6->x[l6].position;
    Position position8 = p7->x[l7].position;
    Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
    Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
    Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
    Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
    Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
    Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
    shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
    shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
    shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
    shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
    shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
    shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
    dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
    dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
    dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
    dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
    dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
    dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
    dst[ituple].scale = src[ituple].scale;
  }
}

void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
  c->tupleCopyWorker(first, last);
}

void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
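  // first == -1 marks the worker that also copies the exclusion tuples; these
  // cannot be split across workers because the running byte offsets pos/pos2
  // depend on the modified flag of every preceding exclusion.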
  if (first == -1) {
    // Separate exclusions into modified and non-modified
    int pos = exclusionStartPos;
    int pos2 = exclusionStartPos2;
    for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
      int ntuples = (*it)->getNumTuples();
      copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
        (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
    }
    first = 0;
  }
  for (int i=first;i <= last;i++) {
    switch (tupleCopyWorkList[i].tupletype) {

      case Tuples::BOND:
      {
        copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      }
      break;

      case Tuples::ANGLE:
      {
        copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      }
      break;

      case Tuples::DIHEDRAL:
      {
        copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
          (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      }
      break;

      case Tuples::IMPROPER:
      {
        copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
          (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
          (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      }
      break;

      case Tuples::CROSSTERM:
      {
        copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
          Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
      }
      break;

      default:
      NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
      break;
    }
  }
}

//
// Copies tuple data from individual buffers to a single contiguous buffer
// NOTE: This is done on the master PE
//
void ComputeBondedCUDA::copyTupleData() {

  PatchMap* patchMap = PatchMap::Object();

  // Count the number of exclusions
  int numModifiedExclusions = 0;
  int numExclusions = 0;
  for (int i=0;i < numExclPerRank.size();i++) {
    numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
    numExclusions         += numExclPerRank[i].numExclusions;
  }
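  // Warp-align the counts so that each tuple type's block starts at a
  // warp-aligned offset in the single contiguous tuple buffer; modified and
  // non-modified exclusions are aligned separately since they form two lists.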
  int numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
  int numExclusionsWA         = ComputeBondedCUDAKernel::warpAlign(numExclusions);

  // Count the number of tuples for each type
  int posWA = 0;
  exclusionStartPos = 0;
  exclusionStartPos2 = 0;
  tupleCopyWorkList.clear();
  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
    // Take temporary position
    int pos = posWA;
    if (tupletype == Tuples::EXCLUSION) {
      exclusionStartPos = pos;
      exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
    }
    // Count for total number of tuples for this tupletype
    int num = 0;
    for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
      int ntuples = (*it)->getNumTuples();
      num += ntuples;
      if (tupletype != Tuples::EXCLUSION) {
        TupleCopyWork tupleCopyWork;
        tupleCopyWork.tupletype     = tupletype;
        tupleCopyWork.ntuples       = ntuples;
        tupleCopyWork.tupleElemList = (*it)->getTupleList();
        tupleCopyWork.tupleDataPos  = pos;
        tupleCopyWorkList.push_back(tupleCopyWork);
        pos += ntuples*CudaTupleTypeSize[tupletype];
      }
    }
    numTuplesPerType[tupletype] = num;
    //
    if (tupletype == Tuples::EXCLUSION) {
      // Warp-align exclusions separately
      posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
    } else {
      posWA += ComputeBondedCUDAKernel::warpAlign(num)*CudaTupleTypeSize[tupletype];
    }
  }
  if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
    NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
  }

  // Set flags for finishPatchesOnPe
  hasExclusions = (numExclusions > 0);
  hasModifiedExclusions = (numModifiedExclusions > 0);

  // Re-allocate storage as needed
  reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);

#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if (useCkLoop >= 1) {
    CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
  } else
#endif
  {
    tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
  }

  bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
    numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
    numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
    tupleData, stream);

  // Re-allocate forces
  int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
  reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
}

//
// Launch work on GPU
//
void ComputeBondedCUDA::launchWork() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");

  cudaCheck(cudaSetDevice(deviceID));

  if (atomsChanged) {
    copyTupleData();
  }

  float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
  float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
  float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);

  int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
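  // (r2_delta_exp - 127) removes the IEEE-754 single-precision exponent bias;
  // the kernel uses r2_delta_expc when converting r^2 values into indices for
  // the interpolation tables.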

  // Calculate forces
  bondedKernel.bondedForce(
    ComputeNonbondedUtil::scale14,
    atomStorageSize,
    doEnergy, doVirial, doSlow,
    lata, latb, latc,
    (float)ComputeNonbondedUtil::cutoff2,
    (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
    (const float4*)atoms, forces,
    energies_virials,
    stream);

  forceDoneSetCallback();
}

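//
// Polls forceDoneEvent via the Charm++ scheduler (CcdCallFnAfter) instead of
// blocking in cudaEventSynchronize(), so the master PE keeps processing
// messages while the GPU finishes the bonded kernels.
//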
void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
  ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;

  if (CkMyPe() != c->masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");

  cudaCheck(cudaSetDevice(c->deviceID));

  cudaError_t err = cudaEventQuery(c->forceDoneEvent);
  if (err == cudaSuccess) {
    // Event has occurred
    c->checkCount = 0;
    traceUserBracketEvent(CUDA_BONDED_KERNEL_EVENT, c->beforeForceCompute, CkWallTimer());
    c->finishPatches();
    return;
  } else if (err != cudaErrorNotReady) {
    // Anything else is an error
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck, cudaEventQuery returned error");
  }

  // Event has not occurred
  c->checkCount++;
  if (c->checkCount >= 1000000) {
    NAMD_bug("ComputeBondedCUDA::forceDoneCheck, check count exceeded");
  }

  // Call again
  CcdCallBacksReset(0, walltime);
  CcdCallFnAfter(forceDoneCheck, arg, 0.1);
}

//
// Set callback for all the work in the stream at this point
//
void ComputeBondedCUDA::forceDoneSetCallback() {
  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
  cudaCheck(cudaSetDevice(deviceID));
  cudaCheck(cudaEventRecord(forceDoneEvent, stream));
  checkCount = 0;
  CcdCallBacksReset(0, CmiWallTimer());
  // Start timer for CUDA kernel
  beforeForceCompute = CkWallTimer();
  // Set the callback to fire after 0.1 ms
  CcdCallFnAfter(forceDoneCheck, this, 0.1);
}

inline void convertForceToDouble(const FORCE_TYPE *af, const int forceStride, double& afx, double& afy, double& afz) {
#ifdef USE_STRIDED_FORCE
  FORCE_TYPE afxt = af[0];
  FORCE_TYPE afyt = af[forceStride];
  FORCE_TYPE afzt = af[forceStride*2];
#else
  FORCE_TYPE afxt = af->x;
  FORCE_TYPE afyt = af->y;
  FORCE_TYPE afzt = af->z;
#endif
#ifdef USE_FP_FORCE
  afx = ((double)afxt)*force_to_double;
  afy = ((double)afyt)*force_to_double;
  afz = ((double)afzt)*force_to_double;
#else
  afx = afxt;
  afy = afyt;
  afz = afzt;
#endif
}

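//
// Accumulate GPU bonded forces into the patch force arrays. sumNbond and
// sumSlow are template parameters so that the per-atom branches are resolved
// at compile time in each of the four instantiations used below.
//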
template <bool sumNbond, bool sumSlow>
void finishForceLoop(const int numAtoms, const int forceStride,
  const FORCE_TYPE* __restrict__ af, const FORCE_TYPE* __restrict__ af_nbond, const FORCE_TYPE* __restrict__ af_slow,
  Force* __restrict__ f, Force* __restrict__ f_nbond, Force* __restrict__ f_slow) {

  for (int j=0;j < numAtoms;j++) {
    {
      double afx, afy, afz;
      convertForceToDouble(af + j, forceStride, afx, afy, afz);
      f[j].x += afx;
      f[j].y += afy;
      f[j].z += afz;
    }
    if (sumNbond)
    {
      double afx, afy, afz;
      convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
      f_nbond[j].x += afx;
      f_nbond[j].y += afy;
      f_nbond[j].z += afz;
    }
    if (sumSlow)
    {
      double afx, afy, afz;
      convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
      f_slow[j].x += afx;
      f_slow[j].y += afy;
      f_slow[j].z += afz;
    }
  }

}

//
// Finish all patches that are on this pe
//
void ComputeBondedCUDA::finishPatchesOnPe() {

  PatchMap* patchMap = PatchMap::Object();
  int myRank = CkMyRank();

  const int forceStride = bondedKernel.getForceStride(atomStorageSize);
  const int forceSize = bondedKernel.getForceSize(atomStorageSize);
  const bool sumNbond = hasModifiedExclusions;
  const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;

  for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
    PatchID patchID = patchIDsPerRank[myRank][i];
    Patch* patch = patchMap->patch(patchID);
    TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
    if (tpe == NULL) {
      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
    }

    int pi = patchIndex[patchID];
    int numAtoms = patches[pi].numAtoms;
    int atomStart = patches[pi].atomStart;

    Force *f = tpe->f;
    Force *f_nbond = tpe->r->f[Results::nbond];
    Force *f_slow = tpe->r->f[Results::slow];

    FORCE_TYPE *af       = forces + atomStart;
    FORCE_TYPE *af_nbond = forces + forceSize + atomStart;
    FORCE_TYPE *af_slow  = forces + 2*forceSize + atomStart;

    if (!sumNbond && !sumSlow) {
      finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && !sumSlow) {
      finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (!sumNbond && sumSlow) {
      finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else if (sumNbond && sumSlow) {
      finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
    } else {
      NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
    }

    tpe->forceBox->close(&tpe->r);
    tpe->positionBox->close(&tpe->x);
    if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
  }

  bool done = false;
  CmiLock(lock);
  patchesCounter -= patchIDsPerRank[CkMyRank()].size();
  if (patchesCounter == 0) {
    patchesCounter = getNumPatches();
    done = true;
  }
  CmiUnlock(lock);
  if (done) {
    computeMgr->sendFinishReductions(masterPe, this);
  }

}

void ComputeBondedCUDA::finishPatches() {

  if (atomsChanged) {
    unmapAtoms();
  }

  computeMgr->sendFinishPatchesOnPe(pes, this);
}

#ifdef WRITE_FULL_VIRIALS
#ifdef USE_FP_VIRIAL
void convertVirial(double *virial) {
  long long int *virial_lli = (long long int *)virial;
  for (int i=0;i < 9;i++) {
    virial[i] = ((double)virial_lli[i])*virial_to_double;
  }
}
#endif
#endif

//
// Finish & submit reductions
//
void ComputeBondedCUDA::finishReductions() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");

  // static int ncall = 0;
  // ncall++;

  int pos = 0;
  for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
    if (numTuplesPerType[tupletype] > 0) {

      if (doEnergy) {
        switch (tupletype) {
          case Tuples::BOND:
          reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
          break;

          case Tuples::ANGLE:
          reduction->item(REDUCTION_ANGLE_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE];
          break;

          case Tuples::DIHEDRAL:
          reduction->item(REDUCTION_DIHEDRAL_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL];
          break;

          case Tuples::IMPROPER:
          reduction->item(REDUCTION_IMPROPER_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER];
          break;

          case Tuples::EXCLUSION:
          reduction->item(REDUCTION_ELECT_ENERGY)      += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT];
          reduction->item(REDUCTION_LJ_ENERGY)         += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
          reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW];
          break;

          case Tuples::CROSSTERM:
          reduction->item(REDUCTION_CROSSTERM_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM];
          break;

          default:
          NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
          break;
        }
      }

      auto it = tupleList[tupletype].begin();
      (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
    }
  }

  if (doVirial) {
#ifdef WRITE_FULL_VIRIALS
#ifdef USE_FP_VIRIAL
    convertVirial(&energies_virials[ComputeBondedCUDAKernel::normalVirialIndex_XX]);
    convertVirial(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX]);
    convertVirial(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX]);
    convertVirial(&energies_virials[ComputeBondedCUDAKernel::amdDiheVirialIndex_XX]);
#endif
#else
#error "non-WRITE_FULL_VIRIALS not implemented"
#endif
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND,  energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW,   energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
    // NOTE: AMD_DIHE virial is also added to NORMAL virial.
    // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
    ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
  }

  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
  reduction->submit();
}

//
// Can only be called by master PE
//
void ComputeBondedCUDA::initialize() {

  if (CkMyPe() != masterPe)
    NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");

  // Build list of PEs
  for (int rank=0;rank < computes.size();rank++) {
    if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
      pes.push_back(CkNodeFirst(CkMyNode()) + rank);
    }
  }

  // Return if no work to do
  if (pes.size() == 0) return;

  initializeCalled = true;
  cudaCheck(cudaSetDevice(deviceID));

#if CUDA_VERSION >= 5050
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
#else
  cudaCheck(cudaStreamCreate(&stream));
#endif
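  // The stream is created with the highest available priority (when supported)
  // so the relatively small bonded kernels are not queued behind bulk work
  // submitted to other streams on this device.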
01186   cudaCheck(cudaEventCreate(&forceDoneEvent));
01187   lock = CmiCreateLock();
01188 
01189   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01190 
01191   PatchMap* patchMap = PatchMap::Object();
01192 
01193   // First, assign all patches in self computes.
01194   // NOTE: These never overlap between PEs. No proxies added.
01195   for (int rank=0;rank < computes.size();rank++) {
01196     std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
01197     for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
01198       for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
01199         if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
01200           tuplePatchList.add( TuplePatchElem(*jt) );
01201           patchIDsPerRank[rank].push_back(*jt);
01202           allPatchIDs.push_back(*jt);
01203         }
01204       }
01205     }
01206   }
01207 
01208   // Second, assign all patches in home computes.
01209   // NOTE: The ranks always have these patches. No proxies added.
01210   for (int rank=0;rank < computes.size();rank++) {
01211     HomeCompute& homeCompute = computes[rank].homeCompute;
01212     std::vector<int>& patchIDs = homeCompute.patchIDs;
01213     for (int i=0;i < patchIDs.size();i++) {
01214       int patchID = patchIDs[i];
01215       if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
01216         tuplePatchList.add( TuplePatchElem(patchID) );
01217         patchIDsPerRank[rank].push_back(patchID);
01218         allPatchIDs.push_back(patchID);
01219       }
01220     }
01221   }
01222 
01223   std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
01224   // Find neighbors that are not added yet
01225   std::vector<int> neighborPids;
01226   for (int rank=0;rank < computes.size();rank++) {
01227     PatchID neighbors[PatchMap::MaxOneOrTwoAway];
01228     HomeCompute& homeCompute = computes[rank].homeCompute;
01229     std::vector<int>& patchIDs = homeCompute.patchIDs;
01230     for (int i=0;i < patchIDs.size();i++) {
01231       int patchID = patchIDs[i];
01232       int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
01233       for (int j=0;j < numNeighbors;j++) {
01234         if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
01235           neighborPids.push_back(neighbors[j]);
01236         }
01237       }
01238     }
01239   }
01240   // Remove duplicates from neighborPids
01241   {
01242     std::sort(neighborPids.begin(), neighborPids.end());
01243     auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
01244     neighborPids.resize(std::distance(neighborPids.begin(), it_end));
01245   }
01246   // Assign neighbors to the PEs on this node that have them
01247   for (int i=0;i < neighborPids.size();i++) {
01248     for (int rank=0;rank < computes.size();rank++) {
01249       int pid = neighborPids[i];
01250       int pe = rank + CkNodeFirst(CkMyNode());
01251       if (patchMap->node(pid) == pe) {
01252         // Patch pid found on PE "pe" on this node
01253         tuplePatchList.add( TuplePatchElem(pid) );
01254         patchIDsPerRank[rank].push_back(pid);
01255         allPatchIDs.push_back(pid);
01256         // Add to this rank's patches
01257         patchIDsToAppend[rank].push_back(pid);
01258         // Add to the list of PEs
01259         pes.push_back(CkNodeFirst(CkMyNode()) + rank);
01260         break;
01261       }
01262     }
01263   }
01264   // Remove duplicates from pes
01265   {
01266     std::sort(pes.begin(), pes.end());
01267     auto it_end = std::unique(pes.begin(), pes.end());
01268     pes.resize(std::distance(pes.begin(), it_end));
01269   }
01270   
01271   // Last, assign all patches in neighbors of home computes
01272   // NOTE: Will create proxies on multiple nodes
01273   for (int rank=0;rank < computes.size();rank++) {
01274     PatchID neighbors[PatchMap::MaxOneOrTwoAway];
01275     HomeCompute& homeCompute = computes[rank].homeCompute;
01276     std::vector<int>& patchIDs = homeCompute.patchIDs;
01277     std::vector<int> neighborPatchIDs;
01278     for (int i=0;i < patchIDs.size();i++) {
01279       int patchID = patchIDs[i];
01280       int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
01281       for (int j=0;j < numNeighbors;j++) {
01282         if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
01283           // Patch not found => Add Proxy
01284           tuplePatchList.add( TuplePatchElem(neighbors[j]) );
01285           patchIDsPerRank[rank].push_back(neighbors[j]);
01286           allPatchIDs.push_back(neighbors[j]);
01287         }
01288         if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0 
01289           && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
01290           neighborPatchIDs.push_back(neighbors[j]);
01291         }
01292       }
01293     }
01294     // Append neighboring patchIDs to homeCompute.patchIDs
01295     // int start = patchIDs.size();
01296     // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
01297     // for (int i=0;i < neighborPatchIDs.size();i++) {
01298     //   patchIDs[start + i] = neighborPatchIDs[i];
01299     // }
01300     for (int i=0;i < neighborPatchIDs.size();i++) {
01301       patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
01302     }
01303   }
01304 
01305   for (int rank=0;rank < patchIDsToAppend.size();rank++) {
01306     for (int i=0;i < patchIDsToAppend[rank].size();i++) {
01307       computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
01308     }
01309   }
01310 
01311   // Remove duplicate patch IDs
01312   {
01313     std::sort(allPatchIDs.begin(), allPatchIDs.end());
01314     auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
01315     allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
01316   }
01317 
01318   // Set number of (unique) patches
01319   setNumPatches(allPatchIDs.size());
01320 
01321   // Reset patchesCounter
01322   patchesCounter = getNumPatches();
01323 
01324   patches.resize(getNumPatches());
01325 
01326   // Setup tupleList
01327   for (int rank=0;rank < computes.size();rank++) {
01328     std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
01329     for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
01330       tupleList[it->tuples->getType()].push_back(it->tuples);
01331     }
01332     HomeCompute& homeCompute = computes[rank].homeCompute;
01333     for (int i=0;i < homeCompute.tuples.size();i++) {
01334       tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
01335     }
01336   }  
01337 
01338   // Allocate host memory for energies and virials
01339   allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
01340 
01341   // Finally, do sanity checks
01342   std::vector<char> patchIDset(patchMap->numPatches(), 0);
01343   int numPatchIDset = 0;
01344   int numPatchIDs = 0;
01345   for (int rank=0;rank < computes.size();rank++) {
01346     numPatchIDs += patchIDsPerRank[rank].size();
01347     for (int i=0;i < patchIDsPerRank[rank].size();i++) {
01348       PatchID patchID = patchIDsPerRank[rank][i];
01349       if (patchIDset[patchID] == 0) numPatchIDset++;
01350       patchIDset[patchID] = 1;
01351       if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
01352         NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
01353       }
01354     }
01355   }
01356   if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
01357     NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
01358   }
01359 
01360   // Warning: Direct indexing used, patchIndex could use up a lot of memory for large systems
01361   patchIndex.resize(patchMap->numPatches());
01362   atomMappers.resize(getNumPatches());
01363   for (int i=0;i < getNumPatches();i++) {
01364     atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
01365     patchIndex[allPatchIDs[i]] = i;
01366   }
01367 
01368   // Copy coefficients to GPU
01369   Parameters* parameters = Node::Object()->parameters;
01370   for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
01371     if (tupleList[tupletype].size() > 0) {
01372       switch(tupletype) {
01373 
01374         case Tuples::BOND:
01375         {
          int NumBondParams = parameters->NumBondParams;
          BondValue* bond_array = parameters->bond_array;
          std::vector<CudaBondValue> bondValues(NumBondParams);
          for (int i=0;i < NumBondParams;i++) {
            bondValues[i].k  = bond_array[i].k;
            bondValues[i].x0 = bond_array[i].x0;
          }
          bondedKernel.setupBondValues(NumBondParams, bondValues.data());
        }
        break;

        case Tuples::ANGLE:
        {
          int NumAngleParams = parameters->NumAngleParams;
          AngleValue* angle_array = parameters->angle_array;
          std::vector<CudaAngleValue> angleValues(NumAngleParams);
          bool normal_ub_error = false;
          for (int i=0;i < NumAngleParams;i++) {
            angleValues[i].k      = angle_array[i].k;
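            // For cosine-based angles (normal == 0), theta0 is stored as
            // cos(theta0) so the kernel can use the cosine form directly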
            if (angle_array[i].normal == 1) {
              angleValues[i].theta0 = angle_array[i].theta0;
            } else {
              angleValues[i].theta0 = cos(angle_array[i].theta0);
            }
            normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
            angleValues[i].k_ub   = angle_array[i].k_ub;
            angleValues[i].r_ub   = angle_array[i].r_ub;
            angleValues[i].normal = angle_array[i].normal;
          }
          if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
          bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
        }
        break;

        case Tuples::DIHEDRAL:
        {
          int NumDihedralParams = parameters->NumDihedralParams;
          DihedralValue* dihedral_array = parameters->dihedral_array;
          int NumDihedralParamsMult = 0;
          for (int i=0;i < NumDihedralParams;i++) {
            NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
          dihedralMultMap.resize(NumDihedralParams);
          int k = 0;
          for (int i=0;i < NumDihedralParams;i++) {
            int multiplicity = dihedral_array[i].multiplicity;
            dihedralMultMap[i] = k;
            for (int j=0;j < multiplicity;j++) {
              dihedralValues[k].k     = dihedral_array[i].values[j].k;
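              // Pack the periodicity n into the upper bits; the low bit flags
              // whether another term of this multi-term dihedral follows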
              dihedralValues[k].n     = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              dihedralValues[k].delta = dihedral_array[i].values[j].delta;
              k++;
            }
          }
          bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
        }
        break;

        case Tuples::IMPROPER:
        {
          int NumImproperParams = parameters->NumImproperParams;
          ImproperValue* improper_array = parameters->improper_array;
          int NumImproperParamsMult = 0;
          for (int i=0;i < NumImproperParams;i++) {
            NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
          }
          std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
          improperMultMap.resize(NumImproperParams);
          int k = 0;
          for (int i=0;i < NumImproperParams;i++) {
            int multiplicity = improper_array[i].multiplicity;
            improperMultMap[i] = k;
            for (int j=0;j < multiplicity;j++) {
              improperValues[k].k     = improper_array[i].values[j].k;
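              // Same periodicity / continuation-bit packing as for the
              // dihedrals above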
              improperValues[k].n     = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
              improperValues[k].delta = improper_array[i].values[j].delta;
              k++;
            }
          }
          bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
        }
        break;

        case Tuples::CROSSTERM:
        {
          int NumCrosstermParams = parameters->NumCrosstermParams;
          CrosstermValue* crossterm_array = parameters->crossterm_array;
          std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
          const int D = CrosstermValue::dim;
          const int N = CrosstermValue::dim - 1;
          for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
            for (int i=0;i < N;i++) {
              for (int j=0;j < N;j++) {

                // Set up coefficients for bicubic interpolation.
                // See https://en.wikipedia.org/wiki/Bicubic_interpolation

                #define INDEX(ncols,i,j)  ((i)*ncols + (j))
                CrosstermData* table = &crossterm_array[ipar].c[0][0];

                const double Ainv[16][16] = {
                  { 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  {-3,  3,  0,  0, -2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 2, -2,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0, -3,  3,  0,  0, -2, -1,  0,  0},
                  { 0,  0,  0,  0,  0,  0,  0,  0,  2, -2,  0,  0,  1,  1,  0,  0},
                  {-3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0, -3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0},
                  { 9, -9, -9,  9,  6,  3, -6, -3,  6, -6,  3, -3,  4,  2,  2,  1},
                  {-6,  6,  6, -6, -3, -3,  3,  3, -4,  4, -2,  2, -2, -2, -1, -1},
                  { 2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0},
                  { 0,  0,  0,  0,  2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0},
                  {-6,  6,  6, -6, -4, -2,  4,  2, -3,  3, -3,  3, -2, -1, -2, -1},
                  { 4, -4, -4,  4,  2,  2, -2, -2,  2, -2,  2, -2,  1,  1,  1,  1}
                };

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

                const double h = M_PI/12.0;
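                // h = 15 degrees in radians, the spacing of the crossterm
                // (CMAP) grid; scaling derivatives by h and h*h expresses
                // them in per-grid-cell units for the unit-square fit below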

                const double x[16] = {
                  table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
                  table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
                  table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
                  table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
                };

                // a = Ainv*x
                float* a = (float *)&crosstermValues[ipar].c[i][j][0];
                for (int k=0;k < 16;k++) {
                  double a_val = 0.0;
                  for (int l=0;l < 16;l++) {
                    a_val += Ainv[k][l]*x[l];
                  }
                  a[k] = (float)a_val;
                }
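
                // a now holds the 16 bicubic coefficients for this grid
                // cell, downcast to single precision for the GPU tables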

              }
            }
          }
          bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
        }
        break;

        case Tuples::EXCLUSION:
        // Nothing to do
        break;

        default:
        NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
        break;
      }
    }
  }

  computeMgr->sendAssignPatchesOnPe(pes, this);
}
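
//
// A minimal illustrative sketch, not part of NAMD: with the alpha-vector
// ordering produced by Ainv above, coefficient a[i + 4*j] multiplies
// u^i * v^j, so the fitted patch would be evaluated at fractional cell
// coordinates (u,v) in [0,1]^2 as below. The helper name bicubicEvalSketch
// is hypothetical and is not called anywhere.
//
static inline double bicubicEvalSketch(const float a[16], double u, double v) {
  double sum = 0.0;
  double vpow = 1.0;                  // v^j
  for (int j=0;j < 4;j++) {
    double upow = 1.0;                // u^i
    for (int i=0;i < 4;i++) {
      sum += a[i + 4*j]*upow*vpow;    // accumulate a_ij * u^i * v^j
      upow *= u;
    }
    vpow *= v;
  }
  return sum;
}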

#endif // BONDED_CUDA
#endif // NAMD_CUDA