ComputeBondedCUDA.C

00001 #include "charm++.h"
00002 #include "NamdTypes.h"
00003 #include "ComputeMgr.h"
00004 #include "WorkDistrib.h"
00005 #include "ProxyMgr.h"
00006 #include "CudaUtils.h"
00007 #include "ComputeBonds.h"
00008 #include "ComputeAngles.h"
00009 #include "ComputeDihedrals.h"
00010 #include "ComputeImpropers.h"
00011 #include "ComputeCrossterms.h"
00012 #include "ComputeNonbondedCUDAExcl.h"
00013 #include "ComputeBondedCUDA.h"
00014 
00015 #ifdef NAMD_CUDA
00016 #ifdef BONDED_CUDA
00017 
00018 #include <algorithm>   // std::find
00019 
00020 const int ComputeBondedCUDA::CudaTupleTypeSize[Tuples::NUM_TUPLE_TYPES] = {
00021   sizeof(CudaBond),          // Bonds
00022   sizeof(CudaAngle),         // Angles
00023   sizeof(CudaDihedral),      // Dihedrals
00024   sizeof(CudaDihedral),      // Impropers
00025   sizeof(CudaExclusion),     // Exclusions
00026   sizeof(CudaCrossterm)      // Crossterms
00027 };
00028 
00029 extern "C" void CcdCallBacksReset(void *ignored, double curWallTime);  // fix Charm++
00030 
00031 //
00032 // Class constructor
00033 //
00034 ComputeBondedCUDA::ComputeBondedCUDA(ComputeID c, ComputeMgr* computeMgr, int deviceID,
00035   CudaNonbondedTables& cudaNonbondedTables) :
00036 Compute(c), computeMgr(computeMgr), deviceID(deviceID), masterPe(CkMyPe()),
00037 bondedKernel(deviceID, cudaNonbondedTables) 
00038 {
00039 
00040   computes.resize(CkMyNodeSize());
00041   patchIDsPerRank.resize(CkMyNodeSize());
00042   numExclPerRank.resize(CkMyNodeSize());
00043   for (int i=0;i < numExclPerRank.size();i++) {
00044     numExclPerRank[i].numModifiedExclusions = 0;
00045     numExclPerRank[i].numExclusions = 0;
00046   }
00047 
00048   atomMap.allocateMap(Node::Object()->molecule->numAtoms);
00049 
00050   flags = NULL;
00051 
00052   tupleData = NULL;
00053   tupleDataSize = 0;
00054 
00055   atoms = NULL;
00056   atomsSize = 0;
00057 
00058   forces = NULL;
00059   forcesSize = 0;
00060 
00061   energies_virials = NULL;
00062 
00063   initializeCalled = false;
00064 
00065   SimParameters *params = Node::Object()->simParameters;
00066   accelMDdoDihe = false;
00067   if (params->accelMDOn) {
00068      if (params->accelMDdihe || params->accelMDdual) accelMDdoDihe=true;
00069   }
00070 
00071 #define BONDED_KERNEL_EVENT 701
00072   traceRegisterUserEvent("CUDA BONDED KERNEL EVENT", BONDED_KERNEL_EVENT);
00073 }
00074 
00075 //
00076 // Class destructor
00077 //
00078 ComputeBondedCUDA::~ComputeBondedCUDA() {
00079   cudaCheck(cudaSetDevice(deviceID));
00080 
00081   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00082   if (forces != NULL) deallocate_host<FORCE_TYPE>(&forces);
00083   if (energies_virials != NULL) deallocate_host<double>(&energies_virials);
00084   if (tupleData != NULL) deallocate_host<char>(&tupleData);
00085 
00086   if (initializeCalled) {
00087     cudaCheck(cudaStreamDestroy(stream));
00088     cudaCheck(cudaEventDestroy(forceDoneEvent));
00089     CmiDestroyLock(lock);
00090     delete reduction;
00091   }
00092 
00093   // NOTE: unregistering happens in the [sync] entry method
00094   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00095 }
00096 
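//
// Unregister the position, average-position and force boxes of every patch
// assigned to this rank. Runs on each PE; triggered from the destructor via
// sendUnregisterBoxesOnPe().
//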
00097 void ComputeBondedCUDA::unregisterBoxesOnPe() {
00098   for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
00099     PatchID patchID = patchIDsPerRank[CkMyRank()][i];
00100     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
00101     if (tpe == NULL || tpe->p == NULL) {
00102       NAMD_bug("ComputeBondedCUDA::unregisterBoxesOnPe, TuplePatchElem not found or setup incorrectly");
00103     }
00104     Patch* patch = tpe->p;
00105     if (tpe->positionBox != NULL) patch->unregisterPositionPickup(this, &tpe->positionBox);
00106     if (tpe->avgPositionBox != NULL) patch->unregisterAvgPositionPickup(this, &tpe->avgPositionBox);
00107     if (tpe->forceBox != NULL) patch->unregisterForceDeposit(this, &tpe->forceBox);
00108   }
00109 }
00110 
00111 //
00112 // Register compute for a given PE. pids is a list of patches the PE has
00113 // Called by master PE
00114 //
00115 void ComputeBondedCUDA::registerCompute(int pe, int type, PatchIDList& pids) {
00116 
00117   if (CkMyPe() != masterPe)
00118     NAMD_bug("ComputeBondedCUDA::registerCompute() called on non master PE");
00119 
00120   int rank = CkRankOf(pe);
00121 
00122   HomeCompute& homeCompute = computes[rank].homeCompute;
00123   if (homeCompute.patchIDs.size() == 0) {
00124     homeCompute.isBasePatch.resize(PatchMap::Object()->numPatches(), 0);
00125     homeCompute.patchIDs.resize(pids.size());
00126     for (int i=0;i < pids.size();i++) {
00127       homeCompute.patchIDs[i] = pids[i];
00128       homeCompute.isBasePatch[pids[i]] = 1;
00129     }
00130   } else {
00131     if (homeCompute.patchIDs.size() != pids.size()) {
00132       NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (1)");
00133     }
00134     for (int i=0;i < pids.size();i++) {
00135       if (homeCompute.patchIDs[i] != pids[i]) {
00136         NAMD_bug("ComputeBondedCUDA::registerCompute(), homeComputes, patch IDs do not match (2)");
00137       }
00138     }
00139   }
00140 
00141   switch(type) {
00142     case computeBondsType:
00143     homeCompute.tuples.push_back(new HomeTuples<BondElem, Bond, BondValue>(Tuples::BOND));
00144     break;
00145 
00146     case computeAnglesType:
00147     homeCompute.tuples.push_back(new HomeTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE));
00148     break;
00149 
00150     case computeDihedralsType:
00151     homeCompute.tuples.push_back(new HomeTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL));
00152     break;
00153 
00154     case computeImpropersType:
00155     homeCompute.tuples.push_back(new HomeTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER));
00156     break;
00157 
00158     case computeExclsType:
00159     homeCompute.tuples.push_back(new HomeTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION));
00160     break;
00161 
00162     case computeCrosstermsType:
00163     homeCompute.tuples.push_back(new HomeTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM));
00164     break;
00165 
00166     default:
00167     NAMD_bug("ComputeBondedCUDA::registerCompute(), Unsupported compute type");
00168     break;
00169   }
00170 
00171 }
00172 
00173 //
00174 // Register self compute for a given PE
00175 // Called by master PE
00176 //
00177 void ComputeBondedCUDA::registerSelfCompute(int pe, int type, int pid) {
00178 
00179   if (CkMyPe() != masterPe)
00180     NAMD_bug("ComputeBondedCUDA::registerSelfCompute() called on non master PE");
00181 
00182   int rank = CkRankOf(pe);
00183 
00184   std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
00185   auto it = find(selfComputes.begin(), selfComputes.end(), SelfCompute(type));
00186   if (it == selfComputes.end()) {
00187     // Type not found, add new one
00188     selfComputes.push_back(SelfCompute(type));
00189     it = selfComputes.begin() + (selfComputes.size() - 1);
00190 
00191     switch(type) {
00192       case computeSelfBondsType:
00193       it->tuples = new SelfTuples<BondElem, Bond, BondValue>(Tuples::BOND);
00194       break;
00195 
00196       case computeSelfAnglesType:
00197       it->tuples = new SelfTuples<AngleElem, Angle, AngleValue>(Tuples::ANGLE);
00198       break;
00199 
00200       case computeSelfDihedralsType:
00201       it->tuples = new SelfTuples<DihedralElem, Dihedral, DihedralValue>(Tuples::DIHEDRAL);
00202       break;
00203 
00204       case computeSelfImpropersType:
00205       it->tuples = new SelfTuples<ImproperElem, Improper, ImproperValue>(Tuples::IMPROPER);
00206       break;
00207 
00208       case computeSelfExclsType:
00209       it->tuples = new SelfTuples<ExclElem, Exclusion, int>(Tuples::EXCLUSION);
00210       break;
00211 
00212       case computeSelfCrosstermsType:
00213       it->tuples = new SelfTuples<CrosstermElem, Crossterm, CrosstermValue>(Tuples::CROSSTERM);
00214       break;
00215 
00216       default:
00217       NAMD_bug("ComputeBondedCUDA::registerSelfCompute(), Unsupported compute type");
00218       break;
00219     }
00220 
00221   }
00222 
00223   // Add patch ID for this type
00224   it->patchIDs.push_back(pid);
00225 }
00226 
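//
// Create proxies for the patches assigned to this rank and register their
// position, average-position and force boxes with this compute. Runs on each
// PE that owns patches for this compute.
//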
00227 void ComputeBondedCUDA::assignPatchesOnPe() {
00228 
00229   PatchMap* patchMap = PatchMap::Object();
00230   for (int i=0;i < patchIDsPerRank[CkMyRank()].size();i++) {
00231     PatchID patchID = patchIDsPerRank[CkMyRank()][i];
00232     ProxyMgr::Object()->createProxy(patchID);
00233     Patch* patch = patchMap->patch(patchID);
00234     if (patch == NULL)
00235       NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, patch not found");
00236     if (flags == NULL) flags = &patchMap->patch(patchID)->flags;
00237     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
00238     if (tpe == NULL) {
00239       NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem not found");
00240     }
00241     if (tpe->p != NULL) {
00242       NAMD_bug("ComputeBondedCUDA::assignPatchesOnPe, TuplePatchElem already registered");
00243     }
00244     // Assign patch and register coordinates and forces manually
00245     tpe->p = patch;
00246     tpe->positionBox = patch->registerPositionPickup(this);
00247     tpe->avgPositionBox = patch->registerAvgPositionPickup(this);
00248     tpe->forceBox = patch->registerForceDeposit(this);
00249   }
00250 }
00251 
00252 //
00253 // atomUpdate() can be called by any Pe
00254 //
00255 void ComputeBondedCUDA::atomUpdate() {
00256   atomsChangedIn = true;
00257 }
00258 
00259 //
00260 // Enqueue doWork on masterPe and return "no work"
00261 // Can be called by any Pe
00262 //
00263 int ComputeBondedCUDA::noWork() {
00264   computeMgr->sendMessageEnqueueWork(masterPe, this);
00265   return 1;
00266 }
00267 
00268 void ComputeBondedCUDA::messageEnqueueWork() {
00269   if (masterPe != CkMyPe())
00270     NAMD_bug("ComputeBondedCUDA::messageEnqueueWork() must be called from master PE");
00271   WorkDistrib::messageEnqueueWork(this);
00272 }
00273 
00274 //
00275 // Sends open-box commands to PEs
00276 // Called on master PE
00277 //
00278 void ComputeBondedCUDA::doWork() {
00279 
00280   if (CkMyPe() != masterPe)
00281     NAMD_bug("ComputeBondedCUDA::doWork() called on non master PE");
00282 
00283   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00284   // atomsChangedIn can be set to true by any Pe
00285   // atomsChanged can only be set by masterPe
00286   // This use of two variables makes sure we don't have a race condition
00287   atomsChanged = atomsChangedIn;
00288   atomsChangedIn = false;
00289 
00290   if (getNumPatches() == 0) return;  // No work to do
00291 
00292   if (flags == NULL)
00293     NAMD_bug("ComputeBondedCUDA::doWork(), no flags set");
00294 
00295   // Read flags
00296   lattice = flags->lattice;
00297   doEnergy = flags->doEnergy;
00298   doVirial = flags->doVirial;
00299   doSlow   = flags->doFullElectrostatics;
00300   doMolly  = flags->doMolly;
00301 
00302   if (atomsChanged) {
00303     // Re-calculate patch atom numbers and storage
00304     updatePatches();
00305   }
00306 
00307   // Open boxes on PEs; once all are open, work is launched on the masterPe
00308   computeMgr->sendOpenBoxesOnPe(pes, this);
00309 }
00310 
00311 //
00312 // This gets called when a patch is ready on a PE
00313 //
00314 void ComputeBondedCUDA::patchReady(PatchID pid, int doneMigration, int seq) {
00315   if (doneMigration) {
00316     // auto it = patchIndex.find(pid);
00317     // if (it == patchIndex.end())
00318     //   NAMD_bug("ComputeBondedCUDA::patchReady, Patch ID not found");
00319     patches[patchIndex[pid]].numAtoms = PatchMap::Object()->patch(pid)->getNumAtoms();
00320   }
00321   CmiLock(lock);
00322   Compute::patchReady(pid, doneMigration, seq);
00323   CmiUnlock(lock);
00324 }
00325 
00326 //
00327 // Recompute each patch's atom-start offset into the contiguous atom buffer and re-allocate atom storage
00328 //
00329 void ComputeBondedCUDA::updatePatches() {
00330   int atomStart = 0;
00331   for (int i=0;i < patches.size();i++) {
00332     patches[i].atomStart = atomStart;
00333     atomStart += patches[i].numAtoms;
00334   }
00335   atomStorageSize = atomStart;
00336 
00337   // Re-allocate atoms
00338   reallocate_host<CudaAtom>(&atoms, &atomsSize, atomStorageSize, 1.4f);
00339 }
00340 
00341 //
00342 // Map atoms GPU-wide
00343 //
00344 void ComputeBondedCUDA::mapAtoms() {
00345 
00346   for (int i=0;i < getNumPatches();i++) {
00347     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
00348     atomMappers[i]->registerIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
00349   }
00350 
00351 }
00352 
00353 //
00354 // Unmap atoms GPU-wide
00355 //
00356 void ComputeBondedCUDA::unmapAtoms() {
00357 
00358   for (int i=0;i < getNumPatches();i++) {
00359     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(allPatchIDs[i]));
00360     atomMappers[i]->unregisterIDsCompAtomExt(tpe->xExt, tpe->xExt + tpe->p->getNumAtoms());
00361   }
00362 
00363 }
00364 
00365 //
00366 // Open all patches that have been assigned to this Pe
00367 //
00368 void ComputeBondedCUDA::openBoxesOnPe() {
00369 
00370   std::vector<int>& patchIDs = patchIDsPerRank[CkMyRank()];
00371 
00372   for (auto it=patchIDs.begin();it != patchIDs.end();it++) {
00373     PatchID patchID = *it;
00374     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
00375     tpe->x = tpe->positionBox->open();
00376     tpe->xExt = tpe->p->getCompAtomExtInfo();
00377     if ( doMolly ) tpe->x_avg = tpe->avgPositionBox->open();
00378     tpe->r = tpe->forceBox->open();
00379     tpe->f = tpe->r->f[Results::normal];
00380     if (accelMDdoDihe) tpe->af = tpe->r->f[Results::amdf]; // for dihedral-only or dual-boost accelMD
00381 
00382     // Copy atoms
00383     int pi = patchIndex[patchID];
00384     int atomStart = patches[pi].atomStart;
00385     int numAtoms = patches[pi].numAtoms;
00386     CompAtom* compAtom = tpe->x;
00387     const CompAtomExt *aExt = tpe->p->getCompAtomExtInfo();
00388     const CudaAtom *src = tpe->p->getCudaAtomList();
00389     for (int i=0;i < numAtoms;i++) {
00390       int j = aExt[i].sortOrder;
00391       atoms[atomStart + j] = src[i];
00392     }
00393 
00394   }
00395 
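  // Node-level barrier: the last rank to open its boxes triggers the next step,
  // either tuple loading (after atom migration) or launching work on the masterPe.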
00396   bool done = false;
00397   CmiLock(lock);
00398   patchesCounter -= patchIDs.size();
00399   if (patchesCounter == 0) {
00400     patchesCounter = getNumPatches();
00401     done = true;
00402   }
00403   CmiUnlock(lock);
00404   if (done) {
00405     if (atomsChanged) {
00406       mapAtoms();
00407       computeMgr->sendLoadTuplesOnPe(pes, this);
00408     } else {
00409       computeMgr->sendLaunchWork(masterPe, this);
00410     }
00411   }
00412 }
00413 
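//
// Helper: split the tuple count of an exclusion list into modified and
// non-modified exclusions, based on each tuple's "modified" flag.
//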
00414 void countNumExclusions(Tuples* tuples, int& numModifiedExclusions, int& numExclusions) {
00415   numModifiedExclusions = 0;
00416   int ntuples = tuples->getNumTuples();
00417   ExclElem* src = (ExclElem *)(tuples->getTupleList());
00418   for (int ituple=0;ituple < ntuples;ituple++) {
00419     if (src[ituple].modified) numModifiedExclusions++;
00420   }
00421   numExclusions = ntuples - numModifiedExclusions;
00422 }
00423 
00424 //
00425 // Load tuples on PE. Note: this can only be done after boxes on all PEs have been opened
00426 //
00427 void ComputeBondedCUDA::loadTuplesOnPe() {
00428 
00429   int numModifiedExclusions = 0;
00430   int numExclusions = 0;
00431 
00432   std::vector< SelfCompute >& selfComputes = computes[CkMyRank()].selfComputes;
00433   // Loop over self compute types
00434   for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
00435     it->tuples->loadTuples(tuplePatchList, NULL, &atomMap, it->patchIDs);
00436     // For exclusions, we must count the number of modified and non-modified exclusions
00437     if (it->tuples->getType() == Tuples::EXCLUSION) {
00438       int tmp1, tmp2;
00439       countNumExclusions(it->tuples, tmp1, tmp2);
00440       numModifiedExclusions += tmp1;
00441       numExclusions += tmp2;
00442     }
00443   }
00444 
00445   HomeCompute& homeCompute = computes[CkMyRank()].homeCompute;
00446   for (int i=0;i < homeCompute.tuples.size();i++) {
00447     homeCompute.tuples[i]->loadTuples(tuplePatchList,
00448       homeCompute.isBasePatch.data(), &atomMap,
00449       homeCompute.patchIDs);
00450     // For exclusions, we must count the number of modified and non-modified exclusions
00451     if (homeCompute.tuples[i]->getType() == Tuples::EXCLUSION) {
00452       int tmp1, tmp2;
00453       countNumExclusions(homeCompute.tuples[i], tmp1, tmp2);
00454       numModifiedExclusions += tmp1;
00455       numExclusions += tmp2;
00456     }
00457   }
00458 
00459   // Store number of exclusions
00460   numExclPerRank[CkMyRank()].numModifiedExclusions = numModifiedExclusions;
00461   numExclPerRank[CkMyRank()].numExclusions         = numExclusions;
00462 
00463   bool done = false;
00464   CmiLock(lock);
00465   patchesCounter -= patchIDsPerRank[CkMyRank()].size();
00466   if (patchesCounter == 0) {
00467     patchesCounter = getNumPatches();
00468     done = true;
00469   }
00470   CmiUnlock(lock);
00471   if (done) {
00472     computeMgr->sendLaunchWork(masterPe, this);
00473   }
00474 }
00475 
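//
// Convert BondElem tuples into CudaBond entries: per-patch local atom indices
// are translated into GPU-wide indices (localIndex + patch atomStart), the
// parameter index is the offset into bond_array, and the i-j offset vector is
// precomputed from the scaled lattice delta and the patch centers.
//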
00476 void ComputeBondedCUDA::copyBondData(const int ntuples, const BondElem* __restrict__ src,
00477   const BondValue* __restrict__ bond_array, CudaBond* __restrict__ dst) {
00478 
00479   PatchMap* patchMap = PatchMap::Object();
00480   for (int ituple=0;ituple < ntuples;ituple++) {
00481     CudaBond dstval;
00482     auto p0 = src[ituple].p[0];
00483     auto p1 = src[ituple].p[1];
00484     int pi0 = patchIndex[p0->patchID];
00485     int pi1 = patchIndex[p1->patchID];
00486     int l0 = src[ituple].localIndex[0];
00487     int l1 = src[ituple].localIndex[1];
00488     dstval.i = l0 + patches[pi0].atomStart;
00489     dstval.j = l1 + patches[pi1].atomStart;
00490     dstval.itype = (src[ituple].value - bond_array);
00491     Position position1 = p0->x[l0].position;
00492     Position position2 = p1->x[l1].position;
00493     Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
00494     shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
00495     dstval.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
00496     dstval.scale = src[ituple].scale;
00497     dst[ituple] = dstval;
00498   }
00499 }
00500 
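//
// Convert AngleElem tuples into CudaAngle entries; same index translation as
// copyBondData, with two precomputed offset vectors (i-j and k-j).
//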
00501 void ComputeBondedCUDA::copyAngleData(const int ntuples, const AngleElem* __restrict__ src,
00502   const AngleValue* __restrict__ angle_array, CudaAngle* __restrict__ dst) {
00503 
00504   PatchMap* patchMap = PatchMap::Object();
00505   for (int ituple=0;ituple < ntuples;ituple++) {
00506     CudaAngle dstval;
00507     auto p0 = src[ituple].p[0];
00508     auto p1 = src[ituple].p[1];
00509     auto p2 = src[ituple].p[2];
00510     int pi0 = patchIndex[p0->patchID];
00511     int pi1 = patchIndex[p1->patchID];
00512     int pi2 = patchIndex[p2->patchID];
00513     int l0 = src[ituple].localIndex[0];
00514     int l1 = src[ituple].localIndex[1];
00515     int l2 = src[ituple].localIndex[2];
00516     dstval.i = l0 + patches[pi0].atomStart;
00517     dstval.j = l1 + patches[pi1].atomStart;
00518     dstval.k = l2 + patches[pi2].atomStart;
00519     dstval.itype = (src[ituple].value - angle_array);
00520     Position position1 = p0->x[l0].position;
00521     Position position2 = p1->x[l1].position;
00522     Position position3 = p2->x[l2].position;
00523     Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
00524     Vector shiftVec32 = lattice.wrap_delta_scaled(position3, position2);
00525     shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
00526     shiftVec32 += patchMap->center(p2->patchID) - patchMap->center(p1->patchID);
00527     dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
00528     dstval.koffsetXYZ = make_float3((float)shiftVec32.x, (float)shiftVec32.y, (float)shiftVec32.z);
00529     dstval.scale = src[ituple].scale;
00530     dst[ituple] = dstval;
00531   }
00532 }
00533 
00534 //
00535 // Used for both dihedrals and impropers
00536 //
00537 template <bool doDihedral, typename T, typename P>
00538 void ComputeBondedCUDA::copyDihedralData(const int ntuples, const T* __restrict__ src,
00539   const P* __restrict__ p_array, CudaDihedral* __restrict__ dst) {
00540 
00541   PatchMap* patchMap = PatchMap::Object();
00542   for (int ituple=0;ituple < ntuples;ituple++) {
00543     CudaDihedral dstval;
00544     auto p0 = src[ituple].p[0];
00545     auto p1 = src[ituple].p[1];
00546     auto p2 = src[ituple].p[2];
00547     auto p3 = src[ituple].p[3];
00548     int pi0 = patchIndex[p0->patchID];
00549     int pi1 = patchIndex[p1->patchID];
00550     int pi2 = patchIndex[p2->patchID];
00551     int pi3 = patchIndex[p3->patchID];
00552     int l0 = src[ituple].localIndex[0];
00553     int l1 = src[ituple].localIndex[1];
00554     int l2 = src[ituple].localIndex[2];
00555     int l3 = src[ituple].localIndex[3];
00556     dstval.i = l0 + patches[pi0].atomStart;
00557     dstval.j = l1 + patches[pi1].atomStart;
00558     dstval.k = l2 + patches[pi2].atomStart;
00559     dstval.l = l3 + patches[pi3].atomStart;
00560     if (doDihedral) {
00561       dstval.itype = dihedralMultMap[(src[ituple].value - p_array)];
00562     } else {
00563       dstval.itype = improperMultMap[(src[ituple].value - p_array)];
00564     }
00565     Position position1 = p0->x[l0].position;
00566     Position position2 = p1->x[l1].position;
00567     Position position3 = p2->x[l2].position;
00568     Position position4 = p3->x[l3].position;
00569     Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
00570     Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
00571     Vector shiftVec43 = lattice.wrap_delta_scaled(position4, position3);
00572     shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
00573     shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
00574     shiftVec43 += patchMap->center(p3->patchID) - patchMap->center(p2->patchID);
00575     dstval.ioffsetXYZ = make_float3((float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
00576     dstval.joffsetXYZ = make_float3((float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
00577     dstval.loffsetXYZ = make_float3((float)shiftVec43.x, (float)shiftVec43.y, (float)shiftVec43.z);
00578     dstval.scale = src[ituple].scale;
00579     dst[ituple] = dstval;
00580   }
00581 }
00582 
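//
// Convert ExclElem tuples into CudaExclusion entries, writing modified
// exclusions to dst1 and non-modified exclusions to dst2; pos and pos2 advance
// by typeSize bytes per tuple so that successive tuple lists append at the
// right offsets.
//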
00583 void ComputeBondedCUDA::copyExclusionData(const int ntuples, const ExclElem* __restrict__ src, const int typeSize,
00584   CudaExclusion* __restrict__ dst1, CudaExclusion* __restrict__ dst2, int& pos, int& pos2) {
00585 
00586   PatchMap* patchMap = PatchMap::Object();
00587   for (int ituple=0;ituple < ntuples;ituple++) {
00588     auto p0 = src[ituple].p[0];
00589     auto p1 = src[ituple].p[1];
00590     int pi0 = patchIndex[p0->patchID];
00591     int pi1 = patchIndex[p1->patchID];
00592     int l0 = src[ituple].localIndex[0];
00593     int l1 = src[ituple].localIndex[1];
00594     CompAtom& ca1 = p0->x[l0];
00595     CompAtom& ca2 = p1->x[l1];
00596     Position position1 = ca1.position;
00597     Position position2 = ca2.position;
00598     Vector shiftVec = lattice.wrap_delta_scaled(position1, position2);
00599     shiftVec += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
00600     CudaExclusion ce;
00601     ce.i            = l0 + patches[pi0].atomStart;
00602     ce.j            = l1 + patches[pi1].atomStart;
00603     ce.vdwtypei    = ca1.vdwType;
00604     ce.vdwtypej    = ca2.vdwType;
00605     ce.ioffsetXYZ = make_float3((float)shiftVec.x, (float)shiftVec.y, (float)shiftVec.z);
00606     //
00607     if (src[ituple].modified) {
00608       *dst1 = ce;
00609       dst1++;
00610       pos += typeSize;
00611     } else {
00612       *dst2 = ce;
00613       dst2++;
00614       pos2 += typeSize;
00615     }
00616   }
00617 }
00618 
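//
// Convert CrosstermElem tuples (eight atoms each) into CudaCrossterm entries,
// translating atom indices and precomputing the six offset vectors used by the
// crossterm kernel.
//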
00619 void ComputeBondedCUDA::copyCrosstermData(const int ntuples, const CrosstermElem* __restrict__ src,
00620   const CrosstermValue* __restrict__ crossterm_array, CudaCrossterm* __restrict__ dst) {
00621 
00622   PatchMap* patchMap = PatchMap::Object();
00623   for (int ituple=0;ituple < ntuples;ituple++) {
00624     auto p0 = src[ituple].p[0];
00625     auto p1 = src[ituple].p[1];
00626     auto p2 = src[ituple].p[2];
00627     auto p3 = src[ituple].p[3];
00628     auto p4 = src[ituple].p[4];
00629     auto p5 = src[ituple].p[5];
00630     auto p6 = src[ituple].p[6];
00631     auto p7 = src[ituple].p[7];
00632     int pi0 = patchIndex[p0->patchID];
00633     int pi1 = patchIndex[p1->patchID];
00634     int pi2 = patchIndex[p2->patchID];
00635     int pi3 = patchIndex[p3->patchID];
00636     int pi4 = patchIndex[p4->patchID];
00637     int pi5 = patchIndex[p5->patchID];
00638     int pi6 = patchIndex[p6->patchID];
00639     int pi7 = patchIndex[p7->patchID];
00640     int l0 = src[ituple].localIndex[0];
00641     int l1 = src[ituple].localIndex[1];
00642     int l2 = src[ituple].localIndex[2];
00643     int l3 = src[ituple].localIndex[3];
00644     int l4 = src[ituple].localIndex[4];
00645     int l5 = src[ituple].localIndex[5];
00646     int l6 = src[ituple].localIndex[6];
00647     int l7 = src[ituple].localIndex[7];
00648     dst[ituple].i1 = l0 + patches[pi0].atomStart;
00649     dst[ituple].i2 = l1 + patches[pi1].atomStart;
00650     dst[ituple].i3 = l2 + patches[pi2].atomStart;
00651     dst[ituple].i4 = l3 + patches[pi3].atomStart;
00652     dst[ituple].i5 = l4 + patches[pi4].atomStart;
00653     dst[ituple].i6 = l5 + patches[pi5].atomStart;
00654     dst[ituple].i7 = l6 + patches[pi6].atomStart;
00655     dst[ituple].i8 = l7 + patches[pi7].atomStart;
00656     dst[ituple].itype = (src[ituple].value - crossterm_array);
00657     Position position1 = p0->x[l0].position;
00658     Position position2 = p1->x[l1].position;
00659     Position position3 = p2->x[l2].position;
00660     Position position4 = p3->x[l3].position;
00661     Position position5 = p4->x[l4].position;
00662     Position position6 = p5->x[l5].position;
00663     Position position7 = p6->x[l6].position;
00664     Position position8 = p7->x[l7].position;
00665     Vector shiftVec12 = lattice.wrap_delta_scaled(position1, position2);
00666     Vector shiftVec23 = lattice.wrap_delta_scaled(position2, position3);
00667     Vector shiftVec34 = lattice.wrap_delta_scaled(position3, position4);
00668     Vector shiftVec56 = lattice.wrap_delta_scaled(position5, position6);
00669     Vector shiftVec67 = lattice.wrap_delta_scaled(position6, position7);
00670     Vector shiftVec78 = lattice.wrap_delta_scaled(position7, position8);
00671     shiftVec12 += patchMap->center(p0->patchID) - patchMap->center(p1->patchID);
00672     shiftVec23 += patchMap->center(p1->patchID) - patchMap->center(p2->patchID);
00673     shiftVec34 += patchMap->center(p2->patchID) - patchMap->center(p3->patchID);
00674     shiftVec56 += patchMap->center(p4->patchID) - patchMap->center(p5->patchID);
00675     shiftVec67 += patchMap->center(p5->patchID) - patchMap->center(p6->patchID);
00676     shiftVec78 += patchMap->center(p6->patchID) - patchMap->center(p7->patchID);
00677     dst[ituple].offset12XYZ = make_float3( (float)shiftVec12.x, (float)shiftVec12.y, (float)shiftVec12.z);
00678     dst[ituple].offset23XYZ = make_float3( (float)shiftVec23.x, (float)shiftVec23.y, (float)shiftVec23.z);
00679     dst[ituple].offset34XYZ = make_float3( (float)shiftVec34.x, (float)shiftVec34.y, (float)shiftVec34.z);
00680     dst[ituple].offset56XYZ = make_float3( (float)shiftVec56.x, (float)shiftVec56.y, (float)shiftVec56.z);
00681     dst[ituple].offset67XYZ = make_float3( (float)shiftVec67.x, (float)shiftVec67.y, (float)shiftVec67.z);
00682     dst[ituple].offset78XYZ = make_float3( (float)shiftVec78.x, (float)shiftVec78.y, (float)shiftVec78.z);
00683     dst[ituple].scale = src[ituple].scale;
00684   }
00685 }
00686 
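//
// CkLoop entry point: param carries the ComputeBondedCUDA instance and
// first/last select a chunk of tupleCopyWorkList. The index range starts at -1
// (see copyTupleData); the chunk that receives -1 also copies the exclusion
// tuples, which are packed separately from the work list.
//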
00687 void ComputeBondedCUDA::tupleCopyWorker(int first, int last, void *result, int paraNum, void *param) {
00688   ComputeBondedCUDA* c = (ComputeBondedCUDA *)param;
00689   c->tupleCopyWorker(first, last);
00690 }
00691 
00692 void ComputeBondedCUDA::tupleCopyWorker(int first, int last) {
00693   if (first == -1) {
00694     // Separate exclusions into modified and non-modified
00695     int pos = exclusionStartPos;
00696     int pos2 = exclusionStartPos2;
00697     for (auto it = tupleList[Tuples::EXCLUSION].begin();it != tupleList[Tuples::EXCLUSION].end();it++) {
00698       int ntuples = (*it)->getNumTuples();
00699       copyExclusionData(ntuples, (ExclElem *)(*it)->getTupleList(), CudaTupleTypeSize[Tuples::EXCLUSION],
00700         (CudaExclusion *)&tupleData[pos], (CudaExclusion *)&tupleData[pos2], pos, pos2);
00701     }
00702     first = 0;
00703   }
00704   for (int i=first;i <= last;i++) {
00705     switch (tupleCopyWorkList[i].tupletype) {
00706 
00707       case Tuples::BOND:
00708       {
00709         copyBondData(tupleCopyWorkList[i].ntuples, (BondElem *)tupleCopyWorkList[i].tupleElemList,
00710           Node::Object()->parameters->bond_array, (CudaBond *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
00711       }
00712       break;
00713 
00714       case Tuples::ANGLE:
00715       {
00716         copyAngleData(tupleCopyWorkList[i].ntuples, (AngleElem *)tupleCopyWorkList[i].tupleElemList,
00717           Node::Object()->parameters->angle_array, (CudaAngle *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
00718       }
00719       break;
00720 
00721       case Tuples::DIHEDRAL:
00722       {
00723         copyDihedralData<true, DihedralElem, DihedralValue>(tupleCopyWorkList[i].ntuples,
00724           (DihedralElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->dihedral_array,
00725           (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
00726       }
00727       break;
00728 
00729       case Tuples::IMPROPER:
00730       {
00731         copyDihedralData<false, ImproperElem, ImproperValue>(tupleCopyWorkList[i].ntuples,
00732           (ImproperElem *)tupleCopyWorkList[i].tupleElemList, Node::Object()->parameters->improper_array,
00733           (CudaDihedral *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
00734       }
00735       break;
00736 
00737       case Tuples::CROSSTERM:
00738       {
00739         copyCrosstermData(tupleCopyWorkList[i].ntuples, (CrosstermElem *)tupleCopyWorkList[i].tupleElemList,
00740           Node::Object()->parameters->crossterm_array, (CudaCrossterm *)&tupleData[tupleCopyWorkList[i].tupleDataPos]);
00741       }
00742       break;
00743 
00744       default:
00745       NAMD_bug("ComputeBondedCUDA::tupleCopyWorker, Unsupported tuple type");
00746       break;
00747     }
00748   }
00749 }
00750 
00751 //
00752 // Copies tuple data from individual buffers to a single contiguous buffer
00753 // NOTE: This is done on the master PE
00754 //
00755 void ComputeBondedCUDA::copyTupleData() {
00756 
00757   PatchMap* patchMap = PatchMap::Object();
00758 
00759   // Count the number of exclusions
00760   int numModifiedExclusions = 0;
00761   int numExclusions = 0;
00762   for (int i=0;i < numExclPerRank.size();i++) {
00763     numModifiedExclusions += numExclPerRank[i].numModifiedExclusions;
00764     numExclusions         += numExclPerRank[i].numExclusions;
00765   }
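  // Round the exclusion counts up to warp-aligned boundaries so that both
  // exclusion lists (and every other tuple type) start at warp-aligned offsets
  // in the single contiguous tupleData buffer.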
00766   int numModifiedExclusionsWA = ComputeBondedCUDAKernel::warpAlign(numModifiedExclusions);
00767   int numExclusionsWA         = ComputeBondedCUDAKernel::warpAlign(numExclusions);
00768 
00769   // Count the number of tuples for each type
00770   int posWA = 0;
00771   exclusionStartPos = 0;
00772   exclusionStartPos2 = 0;
00773   tupleCopyWorkList.clear();
00774   for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
00776     // Working copy of the current buffer position for this tuple type
00776     int pos = posWA;
00777     if (tupletype == Tuples::EXCLUSION) {
00778       exclusionStartPos = pos;
00779       exclusionStartPos2 = pos + numModifiedExclusionsWA*CudaTupleTypeSize[Tuples::EXCLUSION];
00780     }
00781     // Count the total number of tuples for this tuple type
00782     int num = 0;
00783     for (auto it = tupleList[tupletype].begin();it != tupleList[tupletype].end();it++) {
00784       int ntuples = (*it)->getNumTuples();
00785       num += ntuples;
00786       if (tupletype != Tuples::EXCLUSION) {
00787         TupleCopyWork tupleCopyWork;
00788         tupleCopyWork.tupletype     = tupletype;
00789         tupleCopyWork.ntuples       = ntuples;
00790         tupleCopyWork.tupleElemList = (*it)->getTupleList();
00791         tupleCopyWork.tupleDataPos  = pos;
00792         tupleCopyWorkList.push_back(tupleCopyWork);
00793         pos += ntuples*CudaTupleTypeSize[tupletype];
00794       }
00795     }
00796     numTuplesPerType[tupletype] = num;
00797     //
00798     if (tupletype == Tuples::EXCLUSION) {
00799       // Warp-align exclusions separately
00800       posWA += (numModifiedExclusionsWA + numExclusionsWA)*CudaTupleTypeSize[tupletype];
00801     } else {
00802       posWA += ComputeBondedCUDAKernel::warpAlign(num)*CudaTupleTypeSize[tupletype];
00803     }
00804   }
00805   if (numModifiedExclusions + numExclusions != numTuplesPerType[Tuples::EXCLUSION]) {
00806     NAMD_bug("ComputeBondedCUDA::copyTupleData, invalid number of exclusions");
00807   }
00808 
00809   // Set flags for finishPatchesOnPe
00810   hasExclusions = (numExclusions > 0);
00811   hasModifiedExclusions = (numModifiedExclusions > 0);
00812 
00813   // Re-allocate storage as needed
00814   // reallocate_host<char>(&tupleData, &tupleDataSize, size, 1.2f);
00815   reallocate_host<char>(&tupleData, &tupleDataSize, posWA, 1.2f);
00816 
00817 #if CMK_SMP && USE_CKLOOP
00818   int useCkLoop = Node::Object()->simParameters->useCkLoop;
00819   if (useCkLoop >= 1) {
00820     CkLoop_Parallelize(tupleCopyWorker, 1, (void *)this, CkMyNodeSize(), -1, tupleCopyWorkList.size() - 1);
00821   } else
00822 #endif
00823   {
00824     tupleCopyWorker(-1, tupleCopyWorkList.size() - 1);
00825   }
00826 
00827   bondedKernel.update(numTuplesPerType[Tuples::BOND], numTuplesPerType[Tuples::ANGLE],
00828     numTuplesPerType[Tuples::DIHEDRAL], numTuplesPerType[Tuples::IMPROPER],
00829     numModifiedExclusions, numExclusions, numTuplesPerType[Tuples::CROSSTERM],
00830     tupleData, stream);
00831 
00832   // Re-allocate forces
00833   int forceStorageSize = bondedKernel.getAllForceSize(atomStorageSize, true);
00834   reallocate_host<FORCE_TYPE>(&forces, &forcesSize, forceStorageSize, 1.4f);
00835 }
00836 
00837 //
00838 // Launch work on GPU
00839 //
00840 void ComputeBondedCUDA::launchWork() {
00841   if (CkMyPe() != masterPe)
00842     NAMD_bug("ComputeBondedCUDA::launchWork() called on non master PE");
00843 
00844   cudaCheck(cudaSetDevice(deviceID));
00845 
00846   if (atomsChanged) {
00847     copyTupleData();
00848   }
00849 
00850   float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z);
00851   float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z);
00852   float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z);
00853 
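  // Exponent bias used when indexing the exclusion force table from the bit
  // pattern of r2 (assumed to mirror the r2_delta table layout used by the
  // nonbonded CUDA kernels).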
00854   int r2_delta_expc = 64 * (ComputeNonbondedUtil::r2_delta_exp - 127);
00855 
00856   // Calculate forces
00857   bondedKernel.bondedForce(
00858     ComputeNonbondedUtil::scale14,
00859     atomStorageSize,
00860     doEnergy, doVirial, doSlow,
00861     lata, latb, latc,
00862     (float)ComputeNonbondedUtil::cutoff2,
00863     (float)ComputeNonbondedUtil::r2_delta, r2_delta_expc,
00864     (const float4*)atoms, forces,
00865     energies_virials,
00866     stream);
00867 
00868   forceDoneSetCallback();
00869 }
00870 
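//
// Polling callback: query the force-done CUDA event. Once it has completed,
// record the kernel trace bracket and finish the patches; otherwise re-arm the
// callback after 0.1 ms, aborting if the check count grows unreasonably large.
//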
00871 void ComputeBondedCUDA::forceDoneCheck(void *arg, double walltime) {
00872   ComputeBondedCUDA* c = (ComputeBondedCUDA *)arg;
00873 
00874   if (CkMyPe() != c->masterPe)
00875     NAMD_bug("ComputeBondedCUDA::forceDoneCheck called on non masterPe");
00876 
00877   cudaCheck(cudaSetDevice(c->deviceID));
00878 
00879   cudaError_t err = cudaEventQuery(c->forceDoneEvent);
00880   if (err == cudaSuccess) {
00881     // Event has occurred
00882     c->checkCount = 0;
00883     traceUserBracketEvent(BONDED_KERNEL_EVENT, c->beforeForceCompute, CkWallTimer());
00884     c->finishPatches();
00885     return;
00886   } else if (err != cudaErrorNotReady) {
00887     // Anything else is an error
00888     NAMD_bug("ComputeBondedCUDA::forceDoneCheck, cudaEventQuery returned error");
00889   }
00890 
00891   // Event has not occurred
00892   c->checkCount++;
00893   if (c->checkCount >= 1000000) {
00894     NAMD_bug("ComputeBondedCUDA::forceDoneCheck, check count exceeded");
00895   }
00896 
00897   // Call again 
00898   CcdCallBacksReset(0, walltime);
00899   CcdCallFnAfter(forceDoneCheck, arg, 0.1);
00900 }
00901 
00902 //
00903 // Set a callback that fires once all work queued in the stream at this point completes
00904 //
00905 void ComputeBondedCUDA::forceDoneSetCallback() {
00906   if (CkMyPe() != masterPe)
00907     NAMD_bug("ComputeBondedCUDA::forceDoneSetCallback called on non masterPe");
00908   cudaCheck(cudaSetDevice(deviceID));
00909   cudaCheck(cudaEventRecord(forceDoneEvent, stream));
00910   checkCount = 0;
00911   CcdCallBacksReset(0, CmiWallTimer());
00912   // Start timer for CUDA kernel
00913   beforeForceCompute = CkWallTimer();
00914   // Set the call back at 0.1ms
00915   CcdCallFnAfter(forceDoneCheck, this, 0.1);
00916 }
00917 
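//
// Convert one force entry to double-precision x/y/z components. With
// USE_STRIDED_FORCE the components are stored in separate planes forceStride
// elements apart; with USE_FP_FORCE the fixed-point values are scaled back to
// floating point by force_to_double.
//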
00918 inline void convertForceToDouble(const FORCE_TYPE *af, const int forceStride, double& afx, double& afy, double& afz) {
00919 #ifdef USE_STRIDED_FORCE
00920   FORCE_TYPE afxt = af[0];
00921   FORCE_TYPE afyt = af[forceStride];
00922   FORCE_TYPE afzt = af[forceStride*2];
00923 #else
00924   FORCE_TYPE afxt = af->x;
00925   FORCE_TYPE afyt = af->y;
00926   FORCE_TYPE afzt = af->z;
00927 #endif
00928 #ifdef USE_FP_FORCE
00929   afx = ((double)afxt)*force_to_double;
00930   afy = ((double)afyt)*force_to_double;
00931   afz = ((double)afzt)*force_to_double;
00932 #else
00933   afx = afxt;
00934   afy = afyt;
00935   afz = afzt;
00936 #endif
00937 }
00938 
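//
// Accumulate the GPU force buffers into the patch Force arrays. The sumNbond
// and sumSlow template flags select at compile time whether the nonbonded and
// slow (full electrostatics) buffers are also summed, avoiding per-atom
// branching in the inner loop.
//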
00939 template <bool sumNbond, bool sumSlow>
00940 void finishForceLoop(const int numAtoms, const int forceStride,
00941   const FORCE_TYPE* __restrict__ af, const FORCE_TYPE* __restrict__ af_nbond, const FORCE_TYPE* __restrict__ af_slow, 
00942   Force* __restrict__ f, Force* __restrict__ f_nbond, Force* __restrict__ f_slow) {
00943 
00944   for (int j=0;j < numAtoms;j++) {
00945     {
00946       double afx, afy, afz;
00947       convertForceToDouble(af + j, forceStride, afx, afy, afz);
00948       f[j].x += afx;
00949       f[j].y += afy;
00950       f[j].z += afz;
00951     }
00952     if (sumNbond)
00953     {
00954       double afx, afy, afz;
00955       convertForceToDouble(af_nbond + j, forceStride, afx, afy, afz);
00956       f_nbond[j].x += afx;
00957       f_nbond[j].y += afy;
00958       f_nbond[j].z += afz;
00959     }
00960     if (sumSlow)
00961     {
00962       double afx, afy, afz;
00963       convertForceToDouble(af_slow + j, forceStride, afx, afy, afz);
00964       f_slow[j].x += afx;
00965       f_slow[j].y += afy;
00966       f_slow[j].z += afz;
00967     }
00968   }
00969 
00970 }
00971 
00972 //
00973 // Finish all patches that are on this pe
00974 //
00975 void ComputeBondedCUDA::finishPatchesOnPe() {
00976 
00977   PatchMap* patchMap = PatchMap::Object();
00978   int myRank = CkMyRank();
00979 
00980   const int forceStride = bondedKernel.getForceStride(atomStorageSize);
00981   const int forceSize = bondedKernel.getForceSize(atomStorageSize);
00982   const bool sumNbond = hasModifiedExclusions;
00983   const bool sumSlow = (hasModifiedExclusions || hasExclusions) && doSlow;
00984 
00985   for (int i=0;i < patchIDsPerRank[myRank].size();i++) {
00986     PatchID patchID = patchIDsPerRank[myRank][i];
00987     Patch* patch = patchMap->patch(patchID);
00988     TuplePatchElem* tpe = tuplePatchList.find(TuplePatchElem(patchID));
00989     if (tpe == NULL) {
00990       NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, TuplePatchElem not found");
00991     }
00992 
00993     int pi = patchIndex[patchID];
00994     int numAtoms = patches[pi].numAtoms;
00995     int atomStart = patches[pi].atomStart;
00996 
00997     Force *f = tpe->f;
00998     Force *f_nbond = tpe->r->f[Results::nbond];
00999     Force *f_slow = tpe->r->f[Results::slow];
01000 
01001     FORCE_TYPE *af       = forces + atomStart;
01002     FORCE_TYPE *af_nbond = forces + forceSize + atomStart;
01003     FORCE_TYPE *af_slow  = forces + 2*forceSize + atomStart;
01004 
01005     if (!sumNbond && !sumSlow) {
01006       finishForceLoop<false, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
01007     } else if (sumNbond && !sumSlow) {
01008       finishForceLoop<true, false>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
01009     } else if (!sumNbond && sumSlow) {
01010       finishForceLoop<false, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
01011     } else if (sumNbond && sumSlow) {
01012       finishForceLoop<true, true>(numAtoms, forceStride, af, af_nbond, af_slow, f, f_nbond, f_slow);
01013     } else {
01014       NAMD_bug("ComputeBondedCUDA::finishPatchesOnPe, logically impossible choice");
01015     }
01016 
01042 
01043     tpe->forceBox->close(&tpe->r);
01044     tpe->positionBox->close(&tpe->x);
01045     if ( doMolly ) tpe->avgPositionBox->close(&tpe->x_avg);
01046   }
01047 
01048   bool done = false;
01049   CmiLock(lock);
01050   patchesCounter -= patchIDsPerRank[CkMyRank()].size();
01051   if (patchesCounter == 0) {
01052     patchesCounter = getNumPatches();
01053     done = true;
01054   }
01055   CmiUnlock(lock);
01056   if (done) {
01057     computeMgr->sendFinishReductions(masterPe, this);
01058   }
01059 
01060 }
01061 
01062 void ComputeBondedCUDA::finishPatches() {
01063 
01064   if (atomsChanged) {
01065     unmapAtoms();
01066   }
01067 
01068   computeMgr->sendFinishPatchesOnPe(pes, this);
01069 }
01070 
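//
// With fixed-point virials (USE_FP_VIRIAL) the 3x3 virial tensors arrive from
// the kernel as long long integers; convert them in place to double using the
// virial_to_double scale factor.
//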
01071 #ifdef WRITE_FULL_VIRIALS
01072 #ifdef USE_FP_VIRIAL
01073 void convertVirial(double *virial) {
01074   long long int *virial_lli = (long long int *)virial;
01075   for (int i=0;i < 9;i++) {
01076     virial[i] = ((double)virial_lli[i])*virial_to_double;
01077   }
01078 }
01079 #endif
01080 #endif
01081 
01082 //
01083 // Finish & submit reductions
01084 //
01085 void ComputeBondedCUDA::finishReductions() {
01086 
01087   if (CkMyPe() != masterPe)
01088     NAMD_bug("ComputeBondedCUDA::finishReductions() called on non masterPe");
01089 
01090   // static int ncall = 0;
01091   // ncall++;
01092 
01093   int pos = 0;
01094   for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
01095     if (numTuplesPerType[tupletype] > 0) {
01096 
01097       if (doEnergy) {
01098         switch (tupletype) {
01099           case Tuples::BOND:
01100           reduction->item(REDUCTION_BOND_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_BOND];
01101           break;
01102 
01103           case Tuples::ANGLE:
01104           reduction->item(REDUCTION_ANGLE_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ANGLE];
01105           break;
01106 
01107           case Tuples::DIHEDRAL:
01108           reduction->item(REDUCTION_DIHEDRAL_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_DIHEDRAL];
01109           break;
01110 
01111           case Tuples::IMPROPER:
01112           reduction->item(REDUCTION_IMPROPER_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_IMPROPER];
01113           break;
01114 
01115           case Tuples::EXCLUSION:
01116           reduction->item(REDUCTION_ELECT_ENERGY)      += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT];
01117           reduction->item(REDUCTION_LJ_ENERGY)         += energies_virials[ComputeBondedCUDAKernel::energyIndex_LJ];
01118           reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energies_virials[ComputeBondedCUDAKernel::energyIndex_ELECT_SLOW];
01119           break;
01120 
01121           case Tuples::CROSSTERM:
01122           reduction->item(REDUCTION_CROSSTERM_ENERGY) += energies_virials[ComputeBondedCUDAKernel::energyIndex_CROSSTERM];
01123           break;
01124 
01125           default:
01126           NAMD_bug("ComputeBondedCUDA::finishReductions, Unsupported tuple type");
01127           break;
01128         }
01129       }
01130 
01131       auto it = tupleList[tupletype].begin();
01132       (*it)->submitTupleCount(reduction, numTuplesPerType[tupletype]);
01133     }
01134   }
01135 
01136   if (doVirial) {
01137 #ifdef WRITE_FULL_VIRIALS
01138 #ifdef USE_FP_VIRIAL
01139     convertVirial(&energies_virials[ComputeBondedCUDAKernel::normalVirialIndex_XX]);
01140     convertVirial(&energies_virials[ComputeBondedCUDAKernel::nbondVirialIndex_XX]);
01141     convertVirial(&energies_virials[ComputeBondedCUDAKernel::slowVirialIndex_XX]);
01142     convertVirial(&energies_virials[ComputeBondedCUDAKernel::amdDiheVirialIndex_XX]);
01143 #endif
01144 #else
01145 #error "non-WRITE_FULL_VIRIALS not implemented"
01146 #endif
01147     ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,energies_virials, ComputeBondedCUDAKernel::normalVirialIndex);
01148     ADD_TENSOR(reduction, REDUCTION_VIRIAL_NBOND, energies_virials, ComputeBondedCUDAKernel::nbondVirialIndex);
01149     ADD_TENSOR(reduction, REDUCTION_VIRIAL_SLOW,  energies_virials, ComputeBondedCUDAKernel::slowVirialIndex);
01150     ADD_TENSOR(reduction, REDUCTION_VIRIAL_AMD_DIHE, energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
01151     // NOTE: AMD_DIHE virial is also added to NORMAL virial.
01152     // This is what happens in ComputeDihedrals.C and ComputeCrossterms.C
01153     ADD_TENSOR(reduction, REDUCTION_VIRIAL_NORMAL,   energies_virials, ComputeBondedCUDAKernel::amdDiheVirialIndex);
01154   }
01155 
01156   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01157   reduction->submit();
01158 }
01159 
01160 //
01161 // Can only be called by master PE
01162 //
01163 void ComputeBondedCUDA::initialize() {
01164 
01165   if (CkMyPe() != masterPe)
01166     NAMD_bug("ComputeBondedCUDA::initialize() called on non master PE");
01167 
01168   // Build list of PEs
01169   for (int rank=0;rank < computes.size();rank++) {
01170     if (computes[rank].selfComputes.size() > 0 || computes[rank].homeCompute.patchIDs.size() > 0) {
01171       pes.push_back(CkNodeFirst(CkMyNode()) + rank);
01172     }
01173   }
01174 
01175   // Return if no work to do
01176   if (pes.size() == 0) return;
01177 
01178   initializeCalled = true;
01179   cudaCheck(cudaSetDevice(deviceID));
01180 
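  // Create the bonded stream with the greatest available priority when stream
  // priorities are supported (CUDA 5.5 and later); otherwise fall back to a
  // default stream.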
01181 #if CUDA_VERSION >= 5050
01182   int leastPriority, greatestPriority;
01183   cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
01184   cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
01185 #else
01186   cudaCheck(cudaStreamCreate(&stream));
01187 #endif
01188   cudaCheck(cudaEventCreate(&forceDoneEvent));
01189   lock = CmiCreateLock();
01190 
01191   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01192 
01193   PatchMap* patchMap = PatchMap::Object();
01194 
01195   // First, assign all patches in self computes.
01196   // NOTE: These never overlap between PEs. No proxies added.
01197   for (int rank=0;rank < computes.size();rank++) {
01198     std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
01199     for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
01200       for (auto jt=it->patchIDs.begin();jt != it->patchIDs.end();jt++) {
01201         if (!tuplePatchList.find( TuplePatchElem(*jt) ) ) {
01202           tuplePatchList.add( TuplePatchElem(*jt) );
01203           patchIDsPerRank[rank].push_back(*jt);
01204           allPatchIDs.push_back(*jt);
01205         }
01206       }
01207     }
01208   }
01209 
01210   // Second, assign all patches in home computes.
01211   // NOTE: The ranks always have these patches. No proxies added.
01212   for (int rank=0;rank < computes.size();rank++) {
01213     HomeCompute& homeCompute = computes[rank].homeCompute;
01214     std::vector<int>& patchIDs = homeCompute.patchIDs;
01215     for (int i=0;i < patchIDs.size();i++) {
01216       int patchID = patchIDs[i];
01217       if (!tuplePatchList.find( TuplePatchElem(patchID) ) ) {
01218         tuplePatchList.add( TuplePatchElem(patchID) );
01219         patchIDsPerRank[rank].push_back(patchID);
01220         allPatchIDs.push_back(patchID);
01221       }
01222     }
01223   }
01224 
01225   std::vector< std::vector<int> > patchIDsToAppend(CkMyNodeSize());
01226   // Find neighbors that are not added yet
01227   std::vector<int> neighborPids;
01228   for (int rank=0;rank < computes.size();rank++) {
01229     PatchID neighbors[PatchMap::MaxOneOrTwoAway];
01230     HomeCompute& homeCompute = computes[rank].homeCompute;
01231     std::vector<int>& patchIDs = homeCompute.patchIDs;
01232     for (int i=0;i < patchIDs.size();i++) {
01233       int patchID = patchIDs[i];
01234       int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
01235       for (int j=0;j < numNeighbors;j++) {
01236         if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
01237           neighborPids.push_back(neighbors[j]);
01238         }
01239       }
01240     }
01241   }
01242   // Remove duplicates from neighborPids
01243   {
01244     std::sort(neighborPids.begin(), neighborPids.end());
01245     auto it_end = std::unique(neighborPids.begin(), neighborPids.end());
01246     neighborPids.resize(std::distance(neighborPids.begin(), it_end));
01247   }
01248   // Assign neighbors to the PEs on this node that have them
01249   for (int i=0;i < neighborPids.size();i++) {
01250     for (int rank=0;rank < computes.size();rank++) {
01251       int pid = neighborPids[i];
01252       int pe = rank + CkNodeFirst(CkMyNode());
01253       if (patchMap->node(pid) == pe) {
01254         // Patch pid found on PE "pe" on this node
01255         tuplePatchList.add( TuplePatchElem(pid) );
01256         patchIDsPerRank[rank].push_back(pid);
01257         allPatchIDs.push_back(pid);
01258         // Add to this rank's patches
01259         patchIDsToAppend[rank].push_back(pid);
01260         // Add to the list of PEs
01261         pes.push_back(CkNodeFirst(CkMyNode()) + rank);
01262         break;
01263       }
01264     }
01265   }
01266   // Remove duplicates from pes
01267   {
01268     std::sort(pes.begin(), pes.end());
01269     auto it_end = std::unique(pes.begin(), pes.end());
01270     pes.resize(std::distance(pes.begin(), it_end));
01271   }
01272   
01273   // Last, assign all patches in neighbors of home computes
01274   // NOTE: Will create proxies on multiple nodes
01275   for (int rank=0;rank < computes.size();rank++) {
01276     PatchID neighbors[PatchMap::MaxOneOrTwoAway];
01277     HomeCompute& homeCompute = computes[rank].homeCompute;
01278     std::vector<int>& patchIDs = homeCompute.patchIDs;
01279     std::vector<int> neighborPatchIDs;
01280     for (int i=0;i < patchIDs.size();i++) {
01281       int patchID = patchIDs[i];
01282       int numNeighbors = patchMap->upstreamNeighbors(patchID, neighbors);
01283       for (int j=0;j < numNeighbors;j++) {
01284         if (!tuplePatchList.find( TuplePatchElem(neighbors[j]) ) ) {
01285           // Patch not found => Add Proxy
01286           tuplePatchList.add( TuplePatchElem(neighbors[j]) );
01287           patchIDsPerRank[rank].push_back(neighbors[j]);
01288           allPatchIDs.push_back(neighbors[j]);
01289         }
01290         if ( std::count(patchIDs.begin(), patchIDs.end(), neighbors[j]) == 0 
01291           && std::count(neighborPatchIDs.begin(), neighborPatchIDs.end(), neighbors[j]) == 0 ) {
01292           neighborPatchIDs.push_back(neighbors[j]);
01293         }
01294       }
01295     }
01296     // Append neighboring patchIDs to homeCompute.patchIDs
01297     // int start = patchIDs.size();
01298     // patchIDs.resize(patchIDs.size() + neighborPatchIDs.size());
01299     // for (int i=0;i < neighborPatchIDs.size();i++) {
01300     //   patchIDs[start + i] = neighborPatchIDs[i];
01301     // }
01302     for (int i=0;i < neighborPatchIDs.size();i++) {
01303       patchIDsToAppend[rank].push_back(neighborPatchIDs[i]);
01304     }
01305   }
01306 
01307   for (int rank=0;rank < patchIDsToAppend.size();rank++) {
01308     for (int i=0;i < patchIDsToAppend[rank].size();i++) {
01309       computes[rank].homeCompute.patchIDs.push_back(patchIDsToAppend[rank][i]);
01310     }
01311   }
01312 
01313   // Remove duplicate patch IDs
01314   {
01315     std::sort(allPatchIDs.begin(), allPatchIDs.end());
01316     auto it_end = std::unique(allPatchIDs.begin(), allPatchIDs.end());
01317     allPatchIDs.resize(std::distance(allPatchIDs.begin(), it_end));
01318   }
01319 
01320   // Set number of (unique) patches
01321   setNumPatches(allPatchIDs.size());
01322 
01323   // Reset patchesCounter
01324   patchesCounter = getNumPatches();
01325 
01326   patches.resize(getNumPatches());
01327 
01328   // Setup tupleList
01329   for (int rank=0;rank < computes.size();rank++) {
01330     std::vector< SelfCompute >& selfComputes = computes[rank].selfComputes;
01331     for (auto it=selfComputes.begin();it != selfComputes.end();it++) {
01332       tupleList[it->tuples->getType()].push_back(it->tuples);
01333     }
01334     HomeCompute& homeCompute = computes[rank].homeCompute;
01335     for (int i=0;i < homeCompute.tuples.size();i++) {
01336       tupleList[homeCompute.tuples[i]->getType()].push_back(homeCompute.tuples[i]);
01337     }
01338   }  
01339 
01340   // Allocate host memory for energies and virials
01341   allocate_host<double>(&energies_virials, ComputeBondedCUDAKernel::energies_virials_SIZE);
01342 
01343   // Finally, do sanity checks
01344   std::vector<char> patchIDset(patchMap->numPatches(), 0);
01345   int numPatchIDset = 0;
01346   int numPatchIDs = 0;
01347   for (int rank=0;rank < computes.size();rank++) {
01348     numPatchIDs += patchIDsPerRank[rank].size();
01349     for (int i=0;i < patchIDsPerRank[rank].size();i++) {
01350       PatchID patchID = patchIDsPerRank[rank][i];
01351       if (patchIDset[patchID] == 0) numPatchIDset++;
01352       patchIDset[patchID] = 1;
01353       if ( !std::count(allPatchIDs.begin(), allPatchIDs.end(), patchID) ) {
01354         NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
01355       }
01356     }
01357   }
01358   if (numPatchIDs != getNumPatches() || numPatchIDset != getNumPatches()) {
01359     NAMD_bug("ComputeBondedCUDA::initialize(), inconsistent patch mapping");
01360   }
01361 
01362   // Warning: Direct indexing used, patchIndex could use up a lot of memory for large systems
01363   patchIndex.resize(patchMap->numPatches());
01364   atomMappers.resize(getNumPatches());
01365   for (int i=0;i < getNumPatches();i++) {
01366     atomMappers[i] = new AtomMapper(allPatchIDs[i], &atomMap);
01367     patchIndex[allPatchIDs[i]] = i;
01368   }
01369 
01370   // Copy coefficients to GPU
01371   Parameters* parameters = Node::Object()->parameters;
01372   for (int tupletype=0;tupletype < Tuples::NUM_TUPLE_TYPES;tupletype++) {
01373     if (tupleList[tupletype].size() > 0) {
01374       switch(tupletype) {
01375 
01376         case Tuples::BOND:
01377         {
01378           int NumBondParams = parameters->NumBondParams;
01379           BondValue* bond_array = parameters->bond_array;
01380           std::vector<CudaBondValue> bondValues(NumBondParams);
01381           for (int i=0;i < NumBondParams;i++) {
01382             bondValues[i].k  = bond_array[i].k;
01383             bondValues[i].x0 = bond_array[i].x0;
01384           }
01385           bondedKernel.setupBondValues(NumBondParams, bondValues.data());
01386         }
01387         break;
01388 
01389         case Tuples::ANGLE:
01390         {
01391           int NumAngleParams = parameters->NumAngleParams;
01392           AngleValue* angle_array = parameters->angle_array;
01393           std::vector<CudaAngleValue> angleValues(NumAngleParams);
01394           bool normal_ub_error = false;
01395           for (int i=0;i < NumAngleParams;i++) {
01396             angleValues[i].k      = angle_array[i].k;
01397             if (angle_array[i].normal == 1) {
01398               angleValues[i].theta0 = angle_array[i].theta0;
01399             } else {
01400               angleValues[i].theta0 = cos(angle_array[i].theta0);
01401             }
01402             normal_ub_error |= (angle_array[i].normal == 0 && angle_array[i].k_ub);
01403             angleValues[i].k_ub   = angle_array[i].k_ub;
01404             angleValues[i].r_ub   = angle_array[i].r_ub;
01405             angleValues[i].normal = angle_array[i].normal;
01406           }
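                // normal == 0 marks cosine-based (cosAngles) terms, whose theta0 is
                // stored as cos(theta0) above; Urey-Bradley constants (k_ub) are not
                // supported for those terms, hence the check below.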
01407           if (normal_ub_error) NAMD_die("ERROR: Can't use cosAngles with Urey-Bradley angles");
01408           bondedKernel.setupAngleValues(NumAngleParams, angleValues.data());
01409         }
01410         break;
01411 
01412         case Tuples::DIHEDRAL:
01413         {
01414           int NumDihedralParams = parameters->NumDihedralParams;
01415           DihedralValue* dihedral_array = parameters->dihedral_array;
01416           int NumDihedralParamsMult = 0;
01417           for (int i=0;i < NumDihedralParams;i++) {
01418             NumDihedralParamsMult += std::max(0, dihedral_array[i].multiplicity);
01419           }
01420           std::vector<CudaDihedralValue> dihedralValues(NumDihedralParamsMult);
01421           dihedralMultMap.resize(NumDihedralParams);
01422           int k = 0;
01423           for (int i=0;i < NumDihedralParams;i++) {
01424             int multiplicity = dihedral_array[i].multiplicity;
01425             dihedralMultMap[i] = k;
01426             for (int j=0;j < multiplicity;j++) {
01427               dihedralValues[k].k     = dihedral_array[i].values[j].k;
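                    // The periodicity n is stored shifted left by one bit; the low bit
                    // flags whether another multiplicity term for this dihedral type
                    // follows (j < multiplicity - 1).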
01428               dihedralValues[k].n     = (dihedral_array[i].values[j].n << 1) | (j < (multiplicity - 1));
01429               dihedralValues[k].delta = dihedral_array[i].values[j].delta;
01430               k++;
01431             }
01432           }
01433           bondedKernel.setupDihedralValues(NumDihedralParamsMult, dihedralValues.data());
01434         }
01435         break;
01436 
01437         case Tuples::IMPROPER:
01438         {
01439           int NumImproperParams = parameters->NumImproperParams;
01440           ImproperValue* improper_array = parameters->improper_array;
01441           int NumImproperParamsMult = 0;
01442           for (int i=0;i < NumImproperParams;i++) {
01443             NumImproperParamsMult += std::max(0, improper_array[i].multiplicity);
01444           }
01445           std::vector<CudaDihedralValue> improperValues(NumImproperParamsMult);
01446           improperMultMap.resize(NumImproperParams);
01447           int k = 0;
01448           for (int i=0;i < NumImproperParams;i++) {
01449             int multiplicity = improper_array[i].multiplicity;
01450             improperMultMap[i] = k;
01451             for (int j=0;j < multiplicity;j++) {
01452               improperValues[k].k     = improper_array[i].values[j].k;
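                    // Same periodicity/continuation-flag packing as for the dihedral
                    // parameters above.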
01453               improperValues[k].n     = (improper_array[i].values[j].n << 1) | (j < (multiplicity - 1));
01454               improperValues[k].delta = improper_array[i].values[j].delta;
01455               k++;
01456             }
01457           }
01458           bondedKernel.setupImproperValues(NumImproperParamsMult, improperValues.data());
01459         }
01460         break;
01461 
01462         case Tuples::CROSSTERM:
01463         {
01464           int NumCrosstermParams = parameters->NumCrosstermParams;
01465           CrosstermValue* crossterm_array = parameters->crossterm_array;
01466           std::vector<CudaCrosstermValue> crosstermValues(NumCrosstermParams);
01467           const int D = CrosstermValue::dim;
01468           const int N = CrosstermValue::dim - 1;
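                // The crossterm table holds D x D grid points; a separate set of 16
                // bicubic coefficients is computed for each of the N x N cells below.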
01469           for (int ipar=0;ipar < NumCrosstermParams;ipar++) {
01470             for (int i=0;i < N;i++) {
01471               for (int j=0;j < N;j++) {
01472 
01473                 // Sets up coefficients for bicubic interpolation.
01474                 // See https://en.wikipedia.org/wiki/Bicubic_interpolation
01475 
01476                 #define INDEX(ncols,i,j)  ((i)*ncols + (j))
01477                 CrosstermData* table = &crossterm_array[ipar].c[0][0];
01478 
01479                 const double Ainv[16][16] = {
01480                   { 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
01481                   { 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
01482                   {-3,  3,  0,  0, -2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
01483                   { 2, -2,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
01484                   { 0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0},
01485                   { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0},
01486                   { 0,  0,  0,  0,  0,  0,  0,  0, -3,  3,  0,  0, -2, -1,  0,  0},
01487                   { 0,  0,  0,  0,  0,  0,  0,  0,  2, -2,  0,  0,  1,  1,  0,  0},
01488                   {-3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0,  0,  0,  0,  0},
01489                   { 0,  0,  0,  0, -3,  0,  3,  0,  0,  0,  0,  0, -2,  0, -1,  0},
01490                   { 9, -9, -9,  9,  6,  3, -6, -3,  6, -6,  3, -3,  4,  2,  2,  1},
01491                   {-6,  6,  6, -6, -3, -3,  3,  3, -4,  4, -2,  2, -2, -2, -1, -1},
01492                   { 2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0},
01493                   { 0,  0,  0,  0,  2,  0, -2,  0,  0,  0,  0,  0,  1,  0,  1,  0},
01494                   {-6,  6,  6, -6, -4, -2,  4,  2, -3,  3, -3,  3, -2, -1, -2, -1},
01495                   { 4, -4, -4,  4,  2,  2, -2, -2,  2, -2,  2, -2,  1,  1,  1,  1}
01496                 };
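                      // Ainv is the constant 16x16 inverse matrix of the standard bicubic
                      // construction (see the link above): a = Ainv*x yields the coefficients
                      // a_kl of the cell polynomial f(u,v) = sum_{k,l} a_kl * u^k * v^l on
                      // the unit square.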
01497 
01498 #ifndef M_PI
01499 #define M_PI 3.14159265358979323846
01500 #endif
01501 
01502                 const double h = M_PI/12.0;
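                      // h = pi/12 rad (15 degrees) is the grid spacing of the crossterm
                      // (CMAP) map; the unit-square bicubic form expects derivatives per
                      // grid cell, so d10 and d01 are scaled by h and the cross derivative
                      // d11 by h*h in the corner vector x below.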
01503 
01504                 const double x[16] = {
01505                   table[INDEX(D,i,j)].d00, table[INDEX(D,i+1,j)].d00, table[INDEX(D,i,j+1)].d00, table[INDEX(D,i+1,j+1)].d00,
01506                   table[INDEX(D,i,j)].d10*h, table[INDEX(D,i+1,j)].d10*h, table[INDEX(D,i,j+1)].d10*h, table[INDEX(D,i+1,j+1)].d10*h,
01507                   table[INDEX(D,i,j)].d01*h, table[INDEX(D,i+1,j)].d01*h, table[INDEX(D,i,j+1)].d01*h, table[INDEX(D,i+1,j+1)].d01*h,
01508                   table[INDEX(D,i,j)].d11*h*h, table[INDEX(D,i+1,j)].d11*h*h, table[INDEX(D,i,j+1)].d11*h*h, table[INDEX(D,i+1,j+1)].d11*h*h
01509                 };
01510 
01511                 // a = Ainv*x
01512                 float* a = (float *)&crosstermValues[ipar].c[i][j][0];
01513                 for (int k=0;k < 16;k++) {
01514                   double a_val = 0.0;
01515                   for (int l=0;l < 16;l++) {
01516                     a_val += Ainv[k][l]*x[l];
01517                   }
01518                   a[k] = (float)a_val;
01519                 }
01520 
01521               }
01522             }
01523           }
01524           bondedKernel.setupCrosstermValues(NumCrosstermParams, crosstermValues.data());
01525         }
01526         break;
01527 
01528         case Tuples::EXCLUSION:
01529         // Nothing to do
01530         break;
01531 
01532         default:
01533         NAMD_bug("ComputeBondedCUDA::initialize, Undefined tuple type");
01534         break;
01535       }
01536     }
01537   }
01538 
01539   computeMgr->sendAssignPatchesOnPe(pes, this);
01540 }
01541 
01542 #endif // BONDED_CUDA
01543 #endif // NAMD_CUDA
