/*****************************************************************************
 * $Source: /home/cvs/namd/cvsroot/namd2/src/Sequencer.C,v $
 * $Author: jim $
 * $Date: 2016/08/26 19:40:32 $
 * $Revision: 1.1230 $
 *****************************************************************************/
13 
14 // The UPPER_BOUND macro is used to eliminate all of the per atom
15 // computation done for the numerical integration in Sequencer::integrate()
16 // other than the actual force computation and atom migration.
17 // The idea is to "turn off" the integration for doing performance
18 // profiling in order to get an upper bound on the speedup available
19 // by moving the integration parts to the GPU.
20 //
21 // Define it in the Make.config file, i.e. CXXOPTS += -DUPPER_BOUND
22 // or simply uncomment the line below.
23 //
24 //#define UPPER_BOUND
25 
26 //for gbis debugging; print net force on each atom
27 #include "CudaRecord.h"
28 #include "PatchData.h"
29 #include "common.h"
30 #define PRINT_FORCES 0
31 
32 #include "InfoStream.h"
33 #include "Node.h"
34 #include "SimParameters.h"
35 #include "Sequencer.h"
36 #include "HomePatch.h"
37 #include "ReductionMgr.h"
38 #include "CollectionMgr.h"
39 #include "BroadcastObject.h"
40 #include "Output.h"
41 #include "Controller.h"
42 #include "Broadcasts.h"
43 #include "Molecule.h"
44 #include "NamdOneTools.h"
45 #include "LdbCoordinator.h"
46 #include "Thread.h"
47 #include "Random.h"
48 #include "PatchMap.inl"
49 #include "ComputeMgr.h"
50 #include "ComputeGlobal.h"
51 #include "NamdEventsProfiling.h"
52 #include <iomanip>
53 #include "ComputeCUDAMgr.h"
54 #include "CollectionMaster.h"
55 #include "IMDOutput.h"
56 #include "CudaGlobalMasterServer.h"
57 
58 #include "TestArray.h"
59 
60 #include <algorithm> // Used for sorting
61 
#define MIN_DEBUG_LEVEL 3
//#define DEBUGM
//
// Define NL_DEBUG below to activate D_*() macros in integrate_SOA()
// for debugging.
//
//#define NL_DEBUG
#include "Debug.h"

#if USE_HPM
#define START_HPM_STEP 1000
#define STOP_HPM_STEP 1500
#endif

#include "DeviceCUDA.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;
#ifdef __IBMCPP__
// IBM compiler requires separate definition for static members
#endif
#endif

#define SPECIAL_PATCH_ID 91

//
// BEGIN
// print_* routines
// assist in debugging SOA integration code
//
static void print_vel_AOS(
    const FullAtom *a,
    int ilo=0, int ihip1=1
    ) {
  printf("AOS Velocities:\n");
  for (int i=ilo; i < ihip1; i++) {
    printf("%d %g %g %g\n", i,
        a[i].velocity.x, a[i].velocity.y, a[i].velocity.z);
  }
}


static void print_vel_SOA(
    const double *vel_x,
    const double *vel_y,
    const double *vel_z,
    int ilo=0, int ihip1=1
    ) {
  printf("SOA Velocities:\n");
  for (int i=ilo; i < ihip1; i++) {
    printf("%d %g %g %g\n", i, vel_x[i], vel_y[i], vel_z[i]);
  }
}


static void print_tensor(const Tensor& t) {
  printf("%g %g %g %g %g %g %g %g %g\n",
      t.xx, t.xy, t.xz, t.yx, t.yy, t.yz, t.zx, t.zy, t.zz);
}
//
// END
// print_* routines
// assist in debugging SOA integration code
//

struct CheckStep {
  int period;
  int nextstep;

  inline int check(int step) {
    if (step == nextstep) return( nextstep += period, 1 );
    else return 0;
  }

  inline int init(int initstep, int initperiod, int delta=0) {
    period = initperiod;
    nextstep = initstep - (initstep % period) - (delta % period);
    while (nextstep <= initstep) nextstep += period;
    // returns true if initstep is divisible by period
    return (initstep + period == nextstep);
  }

  CheckStep() : period(0), nextstep(0) { }
};
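
//
// Usage sketch for CheckStep (illustrative, not from the original file):
// it replaces "step % period" tests with a precomputed next firing step.
//
//   CheckStep energyFreq;
//   energyFreq.init(0, 100);  // returns 1 since step 0 is divisible by 100
//   energyFreq.check(50);     // returns 0; nextstep is still 100
//   energyFreq.check(100);    // returns 1 and advances nextstep to 200
//
// A nonzero delta shifts the firing phase: after init(0, 4, -3), check()
// fires at steps 3, 7, 11, ..., i.e. steps congruent to (-delta) mod period.
//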


Sequencer::Sequencer(HomePatch *p) :
  simParams(Node::Object()->simParameters),
  patch(p),
  collection(CollectionMgr::Object()),
  ldbSteps(0),
  pairlistsAreValid(0),
  pairlistsAge(0),
  pairlistsAgeLimit(0)
{
  broadcast = new ControllerBroadcasts(patch->ldObjHandle);
  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
  min_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_MINIMIZER,1);
  if (simParams->pressureProfileOn) {
    int ntypes = simParams->pressureProfileAtomTypes;
    int nslabs = simParams->pressureProfileSlabs;
    pressureProfileReduction = ReductionMgr::Object()->willSubmit(
        REDUCTIONS_PPROF_INTERNAL, 3*nslabs*ntypes);
  } else {
    pressureProfileReduction = NULL;
  }
  if (simParams->multigratorOn) {
    multigratorReduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_MULTIGRATOR);
  } else {
    multigratorReduction = NULL;
  }
  ldbCoordinator = (LdbCoordinator::Object());
  random = new Random(simParams->randomSeed);
  random->split(patch->getPatchID()+1,PatchMap::Object()->numPatches()+1);

  // Is soluteScaling enabled?
  if (simParams->soluteScalingOn) {
    // If so, we must "manually" perform charge scaling on startup because
    // Sequencer will not get a scripting task for initial charge scaling.
    // Subsequent rescalings will take place through a scripting task.
    rescaleSoluteCharges(simParams->soluteScalingFactorCharge);
  }

  stochRescale_count = 0;
  masterThread = true;
// patch->write_tip4_props();
#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(SEQUENCER_SOA) && defined(NODEGROUP_FORCE_REGISTER)
  if (simParams->CUDASOAintegrate) {
#if 0
    CUDASequencer = new SequencerCUDA(deviceCUDA->getDeviceID(),
        simParams);
#else
    CUDASequencer = SequencerCUDA::InstanceInit(deviceCUDA->getDeviceID(),
        simParams);
#endif
    CUDASequencer->patchData->reduction->zero();
  }
#endif
}

Sequencer::~Sequencer(void)
{
  delete broadcast;
  delete reduction;
  delete min_reduction;
  delete random;
#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(SEQUENCER_SOA) && defined(NODEGROUP_FORCE_REGISTER)
  if (simParams->CUDASOAintegrate) {
    delete CUDASequencer;
  }
#endif
}

// Invoked by thread
void Sequencer::threadRun(Sequencer* arg)
{
  arg->algorithm();
}

// Invoked by Node::run() via HomePatch::runSequencer()
void Sequencer::run(void)
{
  // create a Thread and invoke it
  DebugM(4, "::run() - this = " << this << "\n" );
  thread = CthCreate((CthVoidFn)&(threadRun),(void*)(this),SEQ_STK_SZ);
  CthSetStrategyDefault(thread);
  priority = PATCH_PRIORITY(patch->getPatchID());
  awaken();
}

void Sequencer::suspend(void)
{
  CthSuspend();
}

// Defines the sequence of operations on a patch, e.g. when to push out
// information for Compute objects to consume, when to migrate atoms, and
// when to add forces to the velocity update.
void Sequencer::algorithm(void)
{
  int scriptTask;
  int scriptSeq = 0;
  // Blocking receive for the script barrier.
  while ( (scriptTask = broadcast->scriptBarrier.get(scriptSeq++)) != SCRIPT_END ) {
    switch ( scriptTask ) {
      case SCRIPT_OUTPUT:
        break;
      case SCRIPT_FORCEOUTPUT:
        break;
      case SCRIPT_MEASURE:
        break;
      case SCRIPT_REINITVELS:
        break;
      case SCRIPT_RESCALEVELS:
        break;
      case SCRIPT_RELOADCHARGES:
        reloadCharges();
        break;
      case SCRIPT_CHECKPOINT:
        patch->checkpoint();
        break;
      case SCRIPT_REVERT:
        patch->revert();
        pairlistsAreValid = 0;
        break;
      case SCRIPT_ATOMSENDRECV:
      case SCRIPT_ATOMSEND:
      case SCRIPT_ATOMRECV:
        patch->exchangeAtoms(scriptTask);
        break;
      case SCRIPT_MINIMIZE:
#if 0
        if (simParams->CUDASOAintegrate) {
          NAMD_die("Minimization is currently not supported on the GPU integrator\n");
        }
#endif
        minimize();
        break;
      case SCRIPT_RUN:
      case SCRIPT_CONTINUE:
        //
        // DJH: Call a cleaned up version of integrate().
        //
        // We could test for simulation options and call a more basic version
        // of integrate() where we can avoid performing most tests.
        //
#ifdef SEQUENCER_SOA
        if ( simParams->SOAintegrateOn ) {
#ifdef NODEGROUP_FORCE_REGISTER
          if (simParams->CUDASOAintegrate) integrate_CUDA_SOA(scriptTask);
          else {
#endif
            integrate_SOA(scriptTask);
#ifdef NODEGROUP_FORCE_REGISTER
          }
#endif
        }
        else
#endif
          integrate(scriptTask);
        break;
      default:
        NAMD_bug("Unknown task in Sequencer::algorithm");
    }
  }
  terminate();
}


#ifdef SEQUENCER_SOA

//
// begin SOA code
//

#if defined(NODEGROUP_FORCE_REGISTER)

void Sequencer::suspendULTs() {
  PatchMap* patchMap = PatchMap::Object();
  CUDASequencer->numPatchesCheckedIn += 1;
  if (CUDASequencer->numPatchesCheckedIn < patchMap->numPatchesOnNode(CkMyPe())) {
    masterThread = false;
    CUDASequencer->waitingThreads.push_back(CthSelf());
    NAMD_EVENT_STOP(patch->flags.event_on, NamdProfileEvent::INTEGRATE_SOA_1);
    CthSuspend();

    // JM: if a thread gets here, it will be migrating atoms until the end of the simulation
    while(true){
      // read global flags
      int lastStep = CUDASequencer->patchData->flags.step;
      int startup = (CUDASequencer->patchData->flags.step == simParams->firstTimestep);
      if (CUDASequencer->breakSuspends) break;
      if (simParams->useDeviceMigration) {
        this->patch->positionsReady_GPU(true, startup);
      } else {
        this->patch->positionsReady_SOA(true);
      }
      CUDASequencer->numPatchesCheckedIn += 1;
      CUDASequencer->waitingThreads.push_back(CthSelf());
      if(CUDASequencer->numPatchesCheckedIn == patchMap->numPatchesOnNode(CkMyPe()) - 1 &&
          CUDASequencer->masterThreadSleeping){
        CUDASequencer->masterThreadSleeping = false;
        CthAwaken(CUDASequencer->masterThread);
      }
      CthSuspend();
    }
  }
}
void Sequencer::wakeULTs(){
  CUDASequencer->numPatchesCheckedIn = 0;
  for (CthThread t : CUDASequencer->waitingThreads) {
    CthAwaken(t);
  }
  CUDASequencer->waitingThreads.clear();
}
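//
// The two routines above implement the per-node ULT handshake: all but the
// last patch thread to check in park themselves in suspendULTs() and are
// restarted by wakeULTs(). numPatchesCheckedIn counts the parked threads;
// a parked thread's only remaining job is to run positionsReady_* for its
// patch on migration steps before suspending again, and masterThreadSleeping
// lets the last worker to check back in re-awaken the master thread.
//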

void Sequencer::runComputeObjectsCUDA(int doMigration, int doGlobal, int pairlists, int nstep, int startup) {

  PatchMap* map = PatchMap::Object();

  bool isMaster = deviceCUDA->getMasterPe() == CkMyPe();
  CmiNodeBarrier();

  // Sync after the node barrier. This makes sure that the position buffers
  // have been populated. However, this doesn't need to happen at the node
  // level, i.e. the non-PME nonbonded calculations can begin before the PME
  // device is finished setting its positions. There is a node barrier after
  // the forces are done, so we don't have to worry about the positions being
  // updated again before they have been consumed.
  if (isMaster) {
    CUDASequencer->sync();
  }


  // JM: Each masterPE owns a particular copy of the compute object we need to
  // launch work on. The goal is to launch work on everyone, but on migration
  // steps there are a few operations that need to be launched on computes
  // owned by different PEs. ComputeBondedCUDA::openBoxesOnPe() is an example:
  // each compute holds a list of PEs with information on which proxy object
  // should also invoke openBoxesOnPe().
  //
  // We need to be mindful of that, and, since we want to launch methods on
  // different computes, a data structure that holds all Computes from all
  // masterPEs is necessary.
  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
  ComputeBondedCUDA* cudaBond = cudaMgr->getComputeBondedCUDA();
  CudaComputeNonbonded* cudaNbond = cudaMgr->getCudaComputeNonbonded();
  CudaPmeOneDevice* cudaPme = ( deviceCUDA->getIsPmeDevice() ?
      cudaMgr->getCudaPmeOneDevice() : NULL );
  int reducePme = ( cudaPme && patch->flags.doVirial );
#ifdef NAMD_CUDA
  auto cudaGlobal = deviceCUDA->getIsGlobalDevice() ? cudaMgr->getCudaGlobalMaster() : nullptr;
  if (isMaster && cudaGlobal) cudaGlobal->setStep(static_cast<int64_t>(patch->flags.step));
#endif // NAMD_CUDA
  // fprintf(stderr, "Patch %d invoking computes\n", this->patch->patchID);


  // JM NOTE: I don't think the scheme below holds for nMasterPes > 1, check it out later

  // Invoking computes on the GPU //
  if(doMigration){
    // JM: if we're on a migration step, we call the setup functions manually,
    // which means:
    //   0. masterPe->doWork();
    //   1. openBoxesOnPe();
    //      loadTuplesOnPe();
    //   2. masterPe->launchWork();
    //   3. finishPatchesOnPe();
    //   4. masterPe->finishReductions();

    if(isMaster){
      NAMD_EVENT_START(1, NamdProfileEvent::MIG_ATOMUPDATE);
      cudaNbond->atomUpdate();
      cudaBond->atomUpdate();
      cudaNbond->doWork();
      cudaBond->doWork();
      NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_ATOMUPDATE);

      if (cudaPme && !simParams->useDeviceMigration) CUDASequencer->atomUpdatePme();
#ifdef NAMD_CUDA
      if (cudaGlobal) {
        cudaGlobal->updateAtomMaps();
      }
#endif // NAMD_CUDA
    }

    CmiNodeBarrier();

    if (simParams->useDeviceMigration) {
      if(isMaster){
        CUDASequencer->launch_set_compute_positions();
        CUDASequencer->sync(); // TODO move this to tuple migration
      }
      CmiNodeBarrier();
    }

    NAMD_EVENT_START(1, NamdProfileEvent::MIG_OPENBOXESONPE);

    // Here we need to do the following, for each Compute:
    for(int i = 0 ; i < CkNumPes(); i++){
      // Here I need to find if the PE is on the bonded PE list
      // XXX NOTE: This might be inefficient. Check the overhead later
      ComputeBondedCUDA* b = CUDASequencer->patchData->cudaBondedList[i];
      CudaComputeNonbonded* nb = CUDASequencer->patchData->cudaNonbondedList[i];
      if (b == NULL) continue;
      auto list = std::find(std::begin(b->getBondedPes()), std::end(b->getBondedPes()), CkMyPe());
      if( list != std::end(b->getBondedPes()) ){
        b->openBoxesOnPe(startup);

        // XXX NOTE: nb has a different PE list!!! We need a different loop for nb
        nb->openBoxesOnPe();

      }
      CmiNodeBarrier();
    }
    NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_OPENBOXESONPE);
    // for the bonded kernels, there's an additional step here, loadTuplesOnPe
    // JM NOTE: Those are major hotspots; they account for 50% of the migration time.
    CmiNodeBarrier();
    NAMD_EVENT_START(1, NamdProfileEvent::MIG_LOADTUPLESONPE);

    // NOTE: problem here: One of the CompAtomExt structures is turning to null, why?
    cudaBond->loadTuplesOnPe(startup);
    NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_LOADTUPLESONPE);
    CmiNodeBarrier();
    NAMD_EVENT_START(1, NamdProfileEvent::MIG_COPYTUPLEDATA);

    if (simParams->useDeviceMigration) {
      cudaBond->copyTupleDataGPU(startup);
    } else {
      cudaBond->copyTupleDataSN();
    }

    NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_COPYTUPLEDATA);
    // waits until everyone has finished opening their respective boxes;
    // the node barrier actually prevents the error that was happening.
    CmiNodeBarrier();
    if(isMaster){
      // launches work on the masterPe
      NAMD_EVENT_START(1, NamdProfileEvent::MIG_LAUNCHWORK);
      cudaBond->launchWork();
      cudaNbond->launchWork();
      if (cudaPme) {
        cudaPme->compute(*(CUDASequencer->patchData->lat), reducePme, this->patch->flags.step);
      }
      cudaNbond->reSortTileLists();
#ifdef NAMD_CUDA
      if (cudaGlobal) {
        cudaGlobal->communicateToClients(&(this->patch->lattice));
        cudaGlobal->communicateToMD();
      }
#endif // NAMD_CUDA
      NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_LAUNCHWORK);
    }

    CmiNodeBarrier();
    // global master force calculation

    if(doGlobal) {
      NAMD_EVENT_START(1, NamdProfileEvent::GM_CALCULATE);
      // Zero all SOA global forces before computing the next global force
      NAMD_EVENT_START(1, NamdProfileEvent::GM_ZERO);
      int numhp = PatchMap::Object()->numHomePatches();
      HomePatchList *hpList = PatchMap::Object()->homePatchList();
      for(int i = 0; i < numhp; ++i) {
        HomePatch *hp = hpList->item(i).patch;
        hp->zero_global_forces_SOA();
      }
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_ZERO);
      NAMD_EVENT_START(1, NamdProfileEvent::GM_DOWORK);
      // call globalmaster to calculate the force from the client.
      computeGlobal->doWork();
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_DOWORK);
      NAMD_EVENT_START(1, NamdProfileEvent::GM_BARRIER);
      CmiNodeBarrier();
      // CkPrintf("post doWork step %d \n",this->patch->flags.step);
      // CUDASequencer->printSOAPositionsAndVelocities();
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_BARRIER);
      if(isMaster) {
        // aggregate and copy the global forces to the d_f_global device buffer
        NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_FORCE);
        CUDASequencer->copyGlobalForcesToDevice();
        NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_FORCE);
      }
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CALCULATE);
    }
    // if (cudaGlobal && isMaster) {
    //   cudaGlobal->communicateToClients(&(this->patch->lattice));
    //   cudaGlobal->communicateToMD();
    // }
    // CmiNodeBarrier();
    NAMD_EVENT_START(1, NamdProfileEvent::MIG_FINISHPATCHES);
    cudaNbond->finishPatches();
    cudaBond->finishPatches();
    NAMD_EVENT_STOP(1, NamdProfileEvent::MIG_FINISHPATCHES);
    CmiNodeBarrier();

    // finishes reductions with the masterPe!
    if(isMaster){
      cudaNbond->finishReductions();
      if (cudaPme) cudaPme->finishReduction(reducePme);
      cudaBond->finishReductions();
#ifdef NAMD_CUDA
      if (cudaGlobal) cudaGlobal->finishReductions(
          CUDASequencer->patchData->reduction);
#endif // NAMD_CUDA
    }
    CmiNodeBarrier();
  }
  // if we're not on a migration step, do the work only on the masterPE, except globalmaster work
  else {
    int doNbond = patch->flags.doNonbonded;
    if(isMaster) {
      // JM NOTE: We issue the nonbonded work first and sync it last
      if (cudaPme) {
        // cudaPme->compute(this->patch->lattice, reducePme, this->patch->flags.step);
        cudaPme->compute(*(CUDASequencer->patchData->lat), reducePme, this->patch->flags.step);
      }
      if (doNbond) cudaNbond->doWork();
      cudaBond->doWork();
#ifdef NAMD_CUDA
      if (cudaGlobal) {
        cudaGlobal->communicateToClients(&(this->patch->lattice));
        cudaGlobal->communicateToMD();
      }
#endif // NAMD_CUDA
    }
    // global master force calculation
    if(doGlobal) {
      NAMD_EVENT_START(1, NamdProfileEvent::GM_CALCULATE);
      NAMD_EVENT_START(1, NamdProfileEvent::GM_ZERO);
      // Zero all SOA global forces before computing the next global force
      int numhp = PatchMap::Object()->numHomePatches();
      HomePatchList *hpList = PatchMap::Object()->homePatchList();
      for(int i = 0; i < numhp; ++i) {
        HomePatch *hp = hpList->item(i).patch;
        hp->zero_global_forces_SOA();
      }
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_ZERO);
      // call globalmaster to calculate the force from the client.
      NAMD_EVENT_START(1, NamdProfileEvent::GM_DOWORK);
      computeGlobal->doWork();
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_DOWORK);
      NAMD_EVENT_START(1, NamdProfileEvent::GM_BARRIER);
      CmiNodeBarrier();
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_BARRIER);
      // CkPrintf("post doWork 2 step %d \n",this->patch->flags.step);
      // CUDASequencer->printSOAPositionsAndVelocities();
      if(isMaster) {
        // aggregate and copy the global forces to the d_f_global device buffer
        NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_FORCE);
        CUDASequencer->copyGlobalForcesToDevice();
        NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_FORCE);
      }
      NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CALCULATE);
    }
    // if (cudaGlobal && isMaster) {
    //   cudaGlobal->communicateToClients(&(this->patch->lattice));
    //   cudaGlobal->communicateToMD();
    // }
    if(isMaster) {
      cudaBond->finishPatches();
      if (cudaPme) {
        cudaPme->finishReduction(reducePme);
      }
      if (doNbond) cudaNbond->finishPatches();
#ifdef NAMD_CUDA
      if (cudaGlobal) cudaGlobal->finishReductions(
          CUDASequencer->patchData->reduction);
#endif // NAMD_CUDA
    }
  }

#if 0
  // for migrations, I need to call OpenBoxesOnPe and finishPatches for every Pe
  if ( pairlists ) {
    pairlistsAreValid = 1;
    pairlistsAge = 0;
  }
  if ( pairlistsAreValid /* && !pressureStep */ ) ++pairlistsAge;
#endif
  // CmiNodeBarrier();
}

// apply MC pressure control
void Sequencer::monteCarloPressureControl(
    const int step,
    const int doMigration,
    const int doEnergy,
    const int doVirial,
    const int maxForceNumber,
    const int doGlobal)
{
  bool isMasterPe = (deviceCUDA->getMasterPe() == CkMyPe() );
  NodeReduction *reduction = CUDASequencer->patchData->reduction;
  Controller *c_out = CUDASequencer->patchData->c_out;
  bool mGpuOn = CUDASequencer->mGpuOn;
  Lattice oldLattice = this->patch->lattice;
  Vector origin = this->patch->lattice.origin();
  Tensor factor;
  int accepted = 0; // status of MC volume fluctuation trial

  if(isMasterPe){
    // Backup the reduction values for a rejected move
    CUDASequencer->patchData->nodeReductionSave->setVal(reduction);

    if (deviceCUDA->getIsMasterDevice()) {
      // Send the rescale factor for the Monte Carlo volume change from the controller
      c_out->mcPressure_prepare(step);
      // receive the factor
      factor = c_out->getPositionRescaleFactor(step);
    }

    // Backup positions and forces, scale the coordinates and lattice
    // Setup positions for energy and force calculation
    CUDASequencer->monteCarloPressure_part1(factor, origin, oldLattice);
    if (deviceCUDA->getIsMasterDevice()) {
      // Scale the lattice with factor
      // patch.lattice is pointing to patch.flags.lattice
      this->patch->lattice.rescale(factor);
      CUDASequencer->patchData->lat = &(this->patch->lattice);
      CUDASequencer->patchData->factor = &(factor);
      // Copy scaled lattice flags to all patches
      CUDASequencer->patchData->flags.copyIntFlags(this->patch->flags);

      // Zero all reduction values. We will add halfStep values, if
      // the move is accepted.
      reduction->zero();
    }
  }

  CmiNodeBarrier();
  if(isMasterPe){
    // copy global flags
    CUDASequencer->update_patch_flags();
  }
  CmiNodeBarrier();
  // Calculate the new force and energy after rescaling the coordinates.
  // Migration happened before this function was called.
  this->runComputeObjectsCUDA(0, doGlobal, 1, step, 0 /* startup */);
  CmiNodeBarrier();

  if(isMasterPe){
    // Accumulate force to SOA, calculate External energy/force
    // reduce energy and virial
    CUDASequencer->monteCarloPressure_part2(reduction, step, maxForceNumber,
        doEnergy, doGlobal, doVirial);

    if (deviceCUDA->getIsMasterDevice()) {
      // Check to see if the move is accepted or not
      c_out->mcPressure_accept(step);
      accepted = c_out->getMCAcceptance(step);
      //accepted = broadcast->monteCarloBarostatAcceptance.get(step);
      //printf("Sequencer (accept): step: %d, Pe: %d, ACC status: %d\n", step, CkMyPe(), accepted);
    }

    if (accepted) { // Move accepted
      CUDASequencer->monteCarloPressure_accept(reduction, doMigration);
    } else { // Move rejected
      if (deviceCUDA->getIsMasterDevice()) {
        // Set the lattice to the original value, before scaling
        this->patch->lattice = oldLattice;
        CUDASequencer->patchData->lat = &(this->patch->lattice);
        // Copy scaled lattice flags to all patches
        CUDASequencer->patchData->flags.copyIntFlags(this->patch->flags);
      }

      // Restore all positions and forces and cuLattice
      CUDASequencer->monteCarloPressure_reject(this->patch->lattice);
      // Restore the reduction values
      reduction->setVal(CUDASequencer->patchData->nodeReductionSave);
    }
  }

  CmiNodeBarrier();
  // continue the rejection step. Need to update the lattice in all patches
  if(isMasterPe && !accepted){
    // copy global flags
    CUDASequencer->update_patch_flags();
  }
}
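
//
// Flow of the MC barostat above: back up the reduction values and coordinates,
// apply the trial volume change by rescaling positions and the lattice,
// recompute forces/energies via runComputeObjectsCUDA(), then let the
// Controller decide acceptance. On rejection the lattice, positions, forces,
// and reduction values are all restored and the original flags are
// re-broadcast to the patches.
//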

void Sequencer::doMigrationGPU(const int startup, const int doGlobal,
    const int updatePatchMap) {

  const bool isMasterPe = deviceCUDA->getMasterPe() == CkMyPe();
  const bool updatePatchData = startup || doGlobal || updatePatchMap;
  PatchMap* patchMap = PatchMap::Object();

  bool realloc = false;

  // This will check if a reallocation was done on the previous migration.
  // We use the scratch buffers to store the atomic data during reallocation.
  // However, the migrationDestination data must be maintained throughout
  // migration (and tuple migration, so beyond the scope of this function).
  // We probably should add a function to do this at the end of migration,
  // but for now, DMC thought it was easier to just do it at the beginning.
  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
    if(CUDASequencer->patchData->atomReallocationFlagPerDevice[i] != 0) {
      realloc = true;
      break;
    }
  }
  if (realloc) {
    if (isMasterPe) {
      CUDASequencer->reallocateMigrationDestination();
      CUDASequencer->registerSOAPointersToHost();
    }
    CmiNodeBarrier();
    if (isMasterPe) {
      CUDASequencer->copySOAHostRegisterToDevice();
    }
  }

  // Proceed with migration
  //
  // Starts GPU migration
  //
  if (isMasterPe) {
    CUDASequencer->migrationLocalInit();
    // Hidden stream sync
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->migrationPerform();
    // Hidden stream sync
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->migrationUpdateAtomCounts();
    // Hidden stream sync
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->migrationUpdateAtomOffsets();
    // Hidden stream sync
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->copyPatchDataToHost();
    // Hidden stream sync
  }
  CmiNodeBarrier();

  // Update device buffer allocations
  realloc = false;
  if (isMasterPe) {
    realloc = CUDASequencer->copyPatchData(true, false);
    CUDASequencer->patchData->atomReallocationFlagPerDevice[deviceCUDA->getDeviceIndex()] = realloc;
  }
  CmiNodeBarrier();

  // If any of the devices have reallocated, we need to re-register the p2p buffers
  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
    if(CUDASequencer->patchData->atomReallocationFlagPerDevice[i] != 0) {
      realloc = true;
      break;
    }
  }
  if (realloc) {
    if (isMasterPe) {
      CUDASequencer->registerSOAPointersToHost();
    }
    CmiNodeBarrier();
    if (isMasterPe) {
      CUDASequencer->copySOAHostRegisterToDevice();
    }
  }

  // Performs various post processing like Solute/Solvent sorting and copies back to host
  if (isMasterPe) {
    CUDASequencer->migrationLocalPost(0);
    CUDASequencer->migrationSortAtomsNonbonded();
  }

  // If this is startup, we need to delay this until after the AoS data has been
  // copied back to the host, because we do need the atomIDs for the atom map initially.
  if (!updatePatchData) {
    wakeULTs(); // Wakes everyone back up for migration
    this->patch->positionsReady_GPU(1, startup);
    if(CUDASequencer->numPatchesCheckedIn < patchMap->numPatchesOnNode(CkMyPe()) -1 ) {
      CUDASequencer->masterThreadSleeping = true;
      CUDASequencer->masterThread = CthSelf();
      CthSuspend();
    }
  }

  if (isMasterPe) {
    CUDASequencer->sync();
  }
  CmiNodeBarrier();
  if (isMasterPe) {
    CUDASequencer->migrationUpdateDestination();
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->migrationUpdateProxyDestination();
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->migrationUpdateRemoteOffsets();
  }
  CmiNodeBarrier();

  if (isMasterPe) {
    CUDASequencer->copyDataToPeers(true);
  }
  CmiNodeBarrier();

  if (updatePatchData) {
    // The atom maps need to be cleared before the HomePatch atom arrays are updated
    int numhp = PatchMap::Object()->numHomePatches();
    HomePatchList *hpList = PatchMap::Object()->homePatchList();
    for(int i = 0; i < numhp; ++i) {
      HomePatch *hp = hpList->item(i).patch;
      hp->clearAtomMap();
    }
    CmiNodeBarrier();
    if (isMasterPe) {
      // We need the atom ordering to be correct within each
      // patch to set up the atom map. The vdwType of each atom
      // is also used for exclusion tuple generation.
      CUDASequencer->copyAoSDataToHost();
    }
    CmiNodeBarrier();
    wakeULTs(); // Wakes everyone back up for migration
    this->patch->positionsReady_GPU(1, startup);
    if(CUDASequencer->numPatchesCheckedIn < patchMap->numPatchesOnNode(CkMyPe()) -1 ) {
      CUDASequencer->masterThreadSleeping = true;
      CUDASequencer->masterThread = CthSelf();
      CthSuspend();
    }
    CmiNodeBarrier();
  }
  if (isMasterPe) {
    if (doGlobal || simParams->forceDcdFrequency > 0) {
      CUDASequencer->updateHostPatchDataSOA(); // Needs to be called after HomePatch updates
    }
  }
  CmiNodeBarrier();
  if (isMasterPe) {
    // This needs to be called after positionsReady_GPU so that the atom maps have been updated.
    // It is also called from updateDeviceData with startup=true, but we need to call it
    // here with startup=false to make sure the atoms are updated.
    CUDASequencer->migrationUpdateAdvancedFeatures(false);
  }
  CmiNodeBarrier();
}

// JM: Single-node integration scheme
void Sequencer::integrate_CUDA_SOA(int scriptTask){

#ifdef TIMER_COLLECTION
  TimerSet& t = patch->timerSet;
#endif
  TIMER_INIT_WIDTH(t, KICK, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, MAXMOVE, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, DRIFT, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, PISTON, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, SUBMITHALF, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, VELBBK1, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, VELBBK2, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, RATTLE1, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, SUBMITFULL, simParams->timerBinWidth);
  TIMER_INIT_WIDTH(t, SUBMITCOLLECT, simParams->timerBinWidth);

  // Keep track of the step number.
  int &step = patch->flags.step;
  step = simParams->firstTimestep;
  Controller *c_out = CUDASequencer->patchData->c_out;
  PatchMap* patchMap = PatchMap::Object();

  // For multiple time stepping, which force boxes are used?
  int &maxForceUsed = patch->flags.maxForceUsed;
  int &maxForceMerged = patch->flags.maxForceMerged;
  maxForceUsed = Results::normal;
  maxForceMerged = Results::normal;

  // Keep track of total steps and steps per cycle.
  const int numberOfSteps = simParams->N;
  //const int stepsPerCycle = simParams->stepsPerCycle;
  CheckStep stepsPerCycle;
  stepsPerCycle.init(step, simParams->stepsPerCycle);
  // The fundamental time step, get the scaling right for velocity units.
  const BigReal timestep = simParams->dt * RECIP_TIMEFACTOR;

  //const int nonbondedFrequency = simParams->nonbondedFrequency;
  //slowFreq = nonbondedFrequency;
  CheckStep nonbondedFrequency;
  // The step size for short-range nonbonded forces.
  const BigReal nbondstep = timestep * simParams->nonbondedFrequency;
  int &doNonbonded = patch->flags.doNonbonded;
  //doNonbonded = (step >= numberOfSteps) || !(step%nonbondedFrequency);
  doNonbonded = (step >= numberOfSteps) ||
    nonbondedFrequency.init(step, simParams->nonbondedFrequency);
  //if ( nonbondedFrequency == 1 ) maxForceMerged = Results::nbond;
  if ( nonbondedFrequency.period == 1 ) maxForceMerged = Results::nbond;
  if ( doNonbonded ) maxForceUsed = Results::nbond;

  // Do we do full electrostatics?
  const int dofull = ( simParams->fullElectFrequency ? 1 : 0 );
  //const int fullElectFrequency = simParams->fullElectFrequency;
  //if ( dofull ) slowFreq = fullElectFrequency;
  CheckStep fullElectFrequency;
  if ( dofull ) slowFreq = simParams->fullElectFrequency;
  // The step size for long-range electrostatics.
  const BigReal slowstep = timestep * simParams->fullElectFrequency;
  int &doFullElectrostatics = patch->flags.doFullElectrostatics;
  //doFullElectrostatics = (dofull &&
  //    ((step >= numberOfSteps) || !(step%fullElectFrequency)));
  doFullElectrostatics = (dofull &&
      ((step >= numberOfSteps) ||
       fullElectFrequency.init(step, simParams->fullElectFrequency)));
  //if ( dofull && fullElectFrequency == 1 ) maxForceMerged = Results::slow;
  if ( dofull && fullElectFrequency.period == 1 ) maxForceMerged = Results::slow;
  if ( doFullElectrostatics ) maxForceUsed = Results::slow;

  // Bother to calculate energies?
  int &doEnergy = patch->flags.doEnergy;
  //int energyFrequency = simParams->outputEnergies;
  CheckStep energyFrequency;
  int newComputeEnergies = simParams->computeEnergies;
  if(simParams->alchOn) newComputeEnergies = NAMD_gcd(newComputeEnergies, simParams->alchOutFreq);
  doEnergy = energyFrequency.init(step, newComputeEnergies);

  // check for Monte Carlo pressure control
  CheckStep monteCarloPressureFrequency;
  doEnergy += monteCarloPressureFrequency.init(step, (simParams->monteCarloPressureOn ?
        simParams->monteCarloPressureFreq : numberOfSteps + 1) );

  int &doVirial = patch->flags.doVirial;
  doVirial = 1;
  // Do we need to return forces to TCL script or Colvar module?
  int doTcl = simParams->tclForcesOn;
  int doColvars = simParams->colvarsOn;
  int doGlobal = (doTcl || doColvars);
  CheckStep globalMasterFrequency;
  bool globalMasterStep=false;
  int doGlobalObjects=0;
  int doGlobalStaleForces = 0;

  if(doGlobal)
  {
    globalMasterFrequency.init(step, (simParams->globalMasterFrequency > 0 ? simParams->globalMasterFrequency : numberOfSteps+1));
    globalMasterStep = globalMasterFrequency.check(step);
    doGlobalObjects = globalMasterStep? 1:0;
    {
      doGlobalObjects=1;
      doGlobalStaleForces=1;
    }
    {
      doGlobalStaleForces = simParams->globalMasterStaleForces;
    }
    {
      doGlobalStaleForces=doGlobalObjects;
    }
    else
    {
      doGlobalStaleForces=doGlobalObjects;
    }
  }
  else
  {
    doGlobalStaleForces = 0;
    doGlobalObjects = 0;
  }
  // The following flags have to be explicitly disabled in the Patch object.
  patch->flags.doMolly = 0;
  patch->flags.doLoweAndersen = 0;
  patch->flags.doGBIS = 0;
  patch->flags.doLCPO = 0;

  // Square of maximum velocity for simulation safety check
  const BigReal maxvel2 =
    (simParams->cutoff * simParams->cutoff) / (timestep * timestep);

  // check for Langevin piston
  // set period beyond numberOfSteps to disable
  // fprintf(stderr, " Patch %d Pinging in from integrate_cuda!\n", this->patch->getPatchID());
  CheckStep langevinPistonFrequency;
  langevinPistonFrequency.init(step,
      (simParams->langevinPistonOn ? slowFreq : numberOfSteps+1 ),
      (simParams->langevinPistonOn ? -1-slowFreq/2 : 0) /* = delta */);
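  // Note on the delta argument above (derived from CheckStep::init's integer
  // arithmetic): with period = slowFreq and delta = -1-slowFreq/2, check()
  // fires at steps congruent to (1 + slowFreq/2) mod slowFreq, i.e. the
  // piston update is offset from the slow-force steps by roughly half a
  // period. For example, slowFreq = 4 gives steps 3, 7, 11, ...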

  // check for velocity rescaling
  // set period beyond numberOfSteps to disable
  CheckStep stochRescaleFrequency;
  stochRescaleFrequency.init(step, (simParams->stochRescaleOn ?
        simParams->stochRescaleFreq : numberOfSteps+1 ) );

  CheckStep reassignVelocityFrequency;
  reassignVelocityFrequency.init(step, ((simParams->reassignFreq>0) ?
        simParams->reassignFreq : numberOfSteps+1 ) );

  // check for output
  // set period beyond numberOfSteps to disable
  CheckStep restartFrequency;
  restartFrequency.init(step, (simParams->restartFrequency > 0 ?
        simParams->restartFrequency : numberOfSteps+1) );
  CheckStep dcdFrequency;
  dcdFrequency.init(step, (simParams->dcdFrequency > 0 ?
        simParams->dcdFrequency : numberOfSteps+1) );
  CheckStep velDcdFrequency;
  velDcdFrequency.init(step, (simParams->velDcdFrequency > 0 ?
        simParams->velDcdFrequency : numberOfSteps+1) );
  CheckStep forceDcdFrequency;
  forceDcdFrequency.init(step, (simParams->forceDcdFrequency > 0 ?
        simParams->forceDcdFrequency : numberOfSteps+1) );
  CheckStep imdFrequency;
  imdFrequency.init(step, (simParams->IMDfreq > 0 ?
        simParams->IMDfreq : numberOfSteps+1) );

  patch->copy_atoms_to_SOA(); // do this whether or not useDeviceMigration

  // Haochuan: is this really needed for GPU-resident?
  if (simParams->rigidBonds != RIGID_NONE && ! patch->settle_initialized) {
    patch->rattleListValid_SOA = true;
  }

  this->suspendULTs();
  // for "run 0", numberOfSteps is zero, but we want to have at least a single energy evaluation
  if(!masterThread) {
    return;
  }
  bool isMasterPe = (deviceCUDA->getMasterPe() == CkMyPe() );
  CmiNodeBarrier();

  CUDASequencer->breakSuspends = false;

  // XXX this is ugly!
  // one thread will have the CollectionMaster and Output defined;
  // use it to set the node group so that any thread can access them
  if (CUDASequencer->patchData->ptrCollectionMaster == NULL) {
    CollectionMaster *pcm = CkpvAccess(CollectionMaster_instance)->Object();
    if (pcm) {
      CUDASequencer->patchData->ptrCollectionMaster = pcm;
    }
  }
  if (CUDASequencer->patchData->ptrOutput == NULL) {
    Output *pout = Node::Object()->output;
    if (pout) {
      CUDASequencer->patchData->ptrOutput = pout;
    }
  }
  if (CUDASequencer->patchData->pdb == NULL) {
    PDB *pdb = Node::Object()->pdb;
    if (pdb) {
      CUDASequencer->patchData->pdb = pdb;
    }
  }
  if (CUDASequencer->patchData->imd == NULL) {
    IMDOutput *imd = Node::Object()->imd;
    if (imd->getIMD()) {
      CUDASequencer->patchData->imd = imd;
    }
  }

  // Register ComputeCUDAMgrs from each PE into a list for later usage
  if(isMasterPe){
    // Each masterPE registers its own computeCUDAMgr
    CUDASequencer->patchData->cudaBondedList[CkMyPe()] = ComputeCUDAMgr::getComputeCUDAMgr()->getComputeBondedCUDA();
    CUDASequencer->patchData->cudaNonbondedList[CkMyPe()] = ComputeCUDAMgr::getComputeCUDAMgr()->getCudaComputeNonbonded();
  }else{
    CUDASequencer->patchData->cudaBondedList[CkMyPe()] = NULL;
    CUDASequencer->patchData->cudaNonbondedList[CkMyPe()] = NULL;
  }

  if (isMasterPe) {
    ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
    if(dofull && deviceCUDA->getIsPmeDevice()){
      CudaPmeOneDevice* cudaPme = 0;
      cudaPme = cudaMgr->createCudaPmeOneDevice();
    }
    CUDASequencer->patchData->reduction->zero();
  }

  CmiNodeBarrier();

/* JM NOTE: This will contain the first calls of the integration loop. The order is:
 *   1 - Rattle (0,0)
 *   2 - runComputeObjects
 *   3 - addForceToMomentum(-0.5, tstep)
 *   4 - Rattle (-timestep, 0)
 *   5 - submitHalfstep()
 *   6 - addForceToMomentum(1.0, tstep)
 *   7 - Rattle (tstep, 1)
 *   8 - SubmitHalf()
 *   9 - addForceToMomentum(-0.5, tstep)
 *  10 - submitReductions()
 */

  if(scriptTask == SCRIPT_RUN){
    updateDeviceData(1, maxForceUsed, doGlobal);

    if(isMasterPe) {
      if(patchData->updateCounter.load()>0)
      {
        CUDASequencer->updateDeviceKernels();
      }

      // warm_up1 is basically rattle1_SOA(0,0)
      CUDASequencer->startRun1(maxForceUsed, this->patch->lattice);
      (this->patch->flags.sequence)++;
      if (deviceCUDA->getIsMasterDevice()){
        CUDASequencer->patchData->lat = &(this->patch->lattice);
        CUDASequencer->patchData->flags.copyIntFlags(this->patch->flags);
      }
      ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
#ifdef NAMD_CUDA
      const auto cudaGlobalMasterObject = cudaMgr->getCudaGlobalMaster();
#endif // NAMD_CUDA
      const bool addCudaGlobalForces =
#ifdef NAMD_CUDA
        (cudaGlobalMasterObject != nullptr) ?
        cudaGlobalMasterObject->willAddGlobalForces() :
#endif // NAMD_CUDA
        false;
      if (addCudaGlobalForces) {
        CUDASequencer->allocateGPUSavedForces();
      }
    }

    CmiNodeBarrier();
    if (!simParams->useDeviceMigration) {
      wakeULTs(); // Wakes everyone back up for migration
      this->patch->positionsReady_SOA(1);
      if(CUDASequencer->numPatchesCheckedIn < patchMap->numPatchesOnNode(CkMyPe()) -1 ) {
        CUDASequencer->masterThreadSleeping = true;
        CUDASequencer->masterThread = CthSelf();
        CthSuspend();
      }
      CmiNodeBarrier();
      updateDeviceData(0, maxForceUsed, doGlobal);
    } else {
      doMigrationGPU(1, doGlobal, simParams->updateAtomMap);
    }
    CmiNodeBarrier();
    // I've migrated everything. Now run computes
    runComputeObjectsCUDA(/* isMigration = */ 1,
        doGlobal,
        /* step < numberOfSteps */ 1,
        /* step = */ 0,
        /* startup = */ 1);

    if(isMasterPe){
      CUDASequencer->finish_patch_flags(true);
      ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
#ifdef NAMD_CUDA
      const auto cudaGlobalMasterObject = cudaMgr->getCudaGlobalMaster();
#endif // NAMD_CUDA
      const bool addCudaGlobalForces =
#ifdef NAMD_CUDA
        (cudaGlobalMasterObject != nullptr) ?
        cudaGlobalMasterObject->willAddGlobalForces() :
#endif // NAMD_CUDA
        false;
      CUDASequencer->startRun2(timestep,
          nbondstep, slowstep, this->patch->lattice.origin(),
          CUDASequencer->patchData->reduction, doGlobal || addCudaGlobalForces, maxForceUsed);
    }
    CmiNodeBarrier();
    if(isMasterPe){
      const bool requestTotalForces = computeGlobal ? computeGlobal->getForceSendActive() : false;
      ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
#ifdef NAMD_CUDA
      const auto cudaGlobalMasterObject = cudaMgr->getCudaGlobalMaster();
#endif // NAMD_CUDA
      const bool requestGPUTotalForces =
#ifdef NAMD_CUDA
        (cudaGlobalMasterObject != nullptr) ?
        cudaGlobalMasterObject->requestedTotalForces() :
#endif // NAMD_CUDA
        false;
      CUDASequencer->startRun3(timestep,
          nbondstep, slowstep, this->patch->lattice.origin(),
          CUDASequencer->patchData->reduction,
          requestTotalForces, doGlobalStaleForces,
          requestGPUTotalForces,
          maxForceUsed);
    }

    // save total force in computeGlobal; forces are copied from device
    // to host in startRun3
    if (doGlobal) {
      CmiNodeBarrier();
      // store the total force for compute global clients
      int numhp = PatchMap::Object()->numHomePatches();
      HomePatchList *hpList = PatchMap::Object()->homePatchList();
      for(int i = 0; i < numhp; ++i) {
        HomePatch *hp = hpList->item(i).patch;
        computeGlobal->saveTotalForces(hp);
      }
    }
  }

  CmiNodeBarrier();
  // Called everything; now I can go ahead and print the step.
  // PE 0 needs to handle IO as it owns the controller object.
  // JM: What happens if PE 0 does not own a GPU here? XXX Check
  if(deviceCUDA->getIsMasterDevice()) {
    CUDASequencer->patchData->flags.copyIntFlags(this->patch->flags);
    c_out->resetMovingAverage();
    c_out->printStep(step, CUDASequencer->patchData->reduction);
    CUDASequencer->patchData->reduction->zero();
  }
  CmiNodeBarrier();

  // XXX Should we promote velrescaling into Sequencer in order to save
  // the velocity rescaling coefficient between script run commands?
  double velrescaling = 1;
  // --------- Start of the MD loop ------- //
  for( ++step; step <= numberOfSteps; ++step ){
    const int isForcesOutputStep = forceDcdFrequency.check(step);
    int dcdSelectionChecks=0;
    Molecule *molecule = Node::Object()->molecule;
    for(int dcdindex=0; dcdindex<16;++dcdindex)
    {
      int dcdSelectionFrequency = molecule->dcdSelectionParams[dcdindex].frequency;
      if(dcdSelectionFrequency && step % dcdSelectionFrequency)
        dcdSelectionChecks++;
    }
    const int isCollection = restartFrequency.check(step) +
      dcdFrequency.check(step) + velDcdFrequency.check(step) +
      isForcesOutputStep + imdFrequency.check(step) + dcdSelectionChecks;
    int isMigration = false;
    const int doVelocityRescale = stochRescaleFrequency.check(step);
    const int doMCPressure = monteCarloPressureFrequency.check(step);
    // XXX doVelRescale should instead set a "doTemperature" flag
    doEnergy = energyFrequency.check(step) || doVelocityRescale || doMCPressure;
    int langevinPistonStep = langevinPistonFrequency.check(step);

    int reassignVelocityStep = reassignVelocityFrequency.check(step);

    // berendsen pressure control
    int berendsenPressureStep = 0;
    if (simParams->berendsenPressureOn) {
      if ( ! (step % simParams->berendsenPressureFreq) ) {
        berendsenPressureStep = 1;
      }
    }
    if(patchData->updateCounter.load()>0)
    {
      CUDASequencer->updateDeviceKernels();
    }

    if(doGlobal)
    {
      globalMasterStep = globalMasterFrequency.check(step);
      doGlobalObjects = globalMasterStep? 1:0;
      {
        doGlobalObjects=1;
        doGlobalStaleForces=1;
      }
      {
        doGlobalStaleForces = simParams->globalMasterStaleForces;
      }
      {
        doGlobalStaleForces=doGlobalObjects;
      }
      else
      {
        doGlobalStaleForces=doGlobalObjects;
      }
    }
    else
    {
      doGlobalStaleForces = 0;
      doGlobalObjects = 0;
      globalMasterStep = false;
    }
    // CkPrintf("step %d doGlobal %d doGlobalObjects %d doGlobalStaleForces %d globalMasterStep %d globalMasterFrequency %d\n", step, doGlobal, doGlobalObjects, doGlobalStaleForces, globalMasterStep, simParams->globalMasterFrequency);

#if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
    //eon = epid && (beginStep < step && step <= endStep);
    // int eon = epid && (beginStep < step && step <= endStep);
    // if (controlProfiling && step == beginStep) {
    //   NAMD_PROFILE_START();
    // }
    //if (controlProfiling && step == endStep) {
    //  NAMD_PROFILE_STOP();
    //}
#endif

    Vector origin = this->patch->lattice.origin();
    Tensor factor;
    if (deviceCUDA->getIsMasterDevice()) {
      if (simParams->langevinPistonOn) {
        c_out->piston1(step);
      }
      // Get the rescale factor for berendsen from the controller
      if (berendsenPressureStep) {
        c_out->berendsenPressureController(step);
      }
    }
    if (langevinPistonStep || berendsenPressureStep) {
      if (deviceCUDA->getIsMasterDevice()) {
        factor = c_out->getPositionRescaleFactor(step);
        this->patch->lattice.rescale(factor);
        CUDASequencer->patchData->lat = &(this->patch->lattice);
        CUDASequencer->patchData->factor = &(factor); // This looks unsafe, but as devices run in lockstep, it's fine
      }
    }

    CmiNodeBarrier();
    NAMD_EVENT_START(1, NamdProfileEvent::CUDASOA_LAUNCHPT1);
    int previousMaxForceUsed;
    if(isMasterPe){
      // need to remember the number of buffers for the previous force calculation
      previousMaxForceUsed = maxForceUsed;
      // update local flags
      //doNonbonded = !(step%nonbondedFrequency);
      // no need to include doMCPressure since it's a common factor of nonbondedFrequency
      doNonbonded = nonbondedFrequency.check(step);
      // no need to include doMCPressure since it's a common factor of fullElectFrequency
      doFullElectrostatics = (dofull && fullElectFrequency.check(step));
      maxForceUsed = Results::normal;
      if ( doNonbonded ) maxForceUsed = Results::nbond;
      if ( doFullElectrostatics ) maxForceUsed = Results::slow;

      (this->patch->flags.sequence)++;
      // JM: Pressures needed for every timestep if the piston is on

      // copy local flags to global
      if(deviceCUDA->getIsMasterDevice()) CUDASequencer->patchData->flags.copyIntFlags(this->patch->flags);
    }

    CmiNodeBarrier();

    if(isMasterPe){
      CUDASequencer->launch_part1(
          step,
          timestep, nbondstep, slowstep, velrescaling, maxvel2,
          CUDASequencer->patchData->reduction,
          *(CUDASequencer->patchData->factor),
          origin,
          // this->patch->lattice, // need to use the lattice from PE 0 right now
          (langevinPistonStep || berendsenPressureStep) ? *(CUDASequencer->patchData->lat) : this->patch->lattice,
          reassignVelocityStep,
          langevinPistonStep,
          berendsenPressureStep,
          previousMaxForceUsed, // call with previous maxForceUsed
          (const int)(step == simParams->firstTimestep + 1),
          this->patch->flags.savePairlists, // XXX how to initialize?
          this->patch->flags.usePairlists, // XXX how to initialize?
          doEnergy);
      // reset the velocity rescaling coefficient after applying it
      velrescaling = 1;
    }
    if (reassignVelocityStep)
    {
      // CkPrintf("dump after launch_part1\n");
      // CUDASequencer->printSOAPositionsAndVelocities(2,10);
    }
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_LAUNCHPT1);

    CmiNodeBarrier();

    if(isMasterPe){
      CUDASequencer->launch_part11(
          timestep, nbondstep, slowstep, velrescaling, maxvel2,
          CUDASequencer->patchData->reduction,
          *(CUDASequencer->patchData->factor),
          origin,
          // this->patch->lattice, // need to use the lattice from PE 0 right now
          (langevinPistonStep || berendsenPressureStep) ? *(CUDASequencer->patchData->lat) : this->patch->lattice,
          langevinPistonStep,
          previousMaxForceUsed, // call with previous maxForceUsed
          (const int)(step == simParams->firstTimestep + 1),
          this->patch->flags.savePairlists, // XXX how to initialize?
          this->patch->flags.usePairlists, // XXX how to initialize?
          doEnergy);
      // reset the velocity rescaling coefficient after applying it
      velrescaling = 1;
    }
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_LAUNCHPT1);

    CmiNodeBarrier();

    for(int i = 0; i < deviceCUDA->getNumDevice(); i++){
      if(CUDASequencer->patchData->migrationFlagPerDevice[i] != 0) {
        isMigration = true;
        break;
      }
    }

    if(isMasterPe){
      // If this is a device-migration step we'll do it later
      if (!simParams->useDeviceMigration || !isMigration) {
        CUDASequencer->launch_set_compute_positions();
      }
    }

    // isMigration = (CUDASequencer->patchData->migrationFlagPerDevice.end() != t) ? 1:0;

    if(isMasterPe) {
      // if(CkMyPe() == 0) CUDASequencer->updatePairlistFlags(isMigration);
      CUDASequencer->updatePairlistFlags(isMigration);
      if (!simParams->useDeviceMigration) {
        CUDASequencer->copyPositionsAndVelocitiesToHost(isMigration, doGlobalObjects);
      }
    }


    if(isMigration) {
      if (!simParams->useDeviceMigration) {
        CmiNodeBarrier();
        wakeULTs(); // sets the number of patches
        this->patch->positionsReady_SOA(isMigration);
        if(CUDASequencer->numPatchesCheckedIn < patchMap->numPatchesOnNode(CkMyPe()) -1 ) {
          CUDASequencer->masterThreadSleeping = true;
          CUDASequencer->masterThread = CthSelf();
          CthSuspend(); // suspends until everyone else has pinged back. :]
        }
        CmiNodeBarrier();
        updateDeviceData(0, maxForceUsed, doGlobal);
      } else {
        doMigrationGPU(false, doGlobal, simParams->updateAtomMap);
        CmiNodeBarrier();
      }
    }

    // Calculate force/energy for bond, nonbond, PME.
    this->runComputeObjectsCUDA(isMigration, doGlobalObjects, step<numberOfSteps, step, 0 /* startup */);

    if (isMasterPe) {
      // if(CkMyPe() == 0) CUDASequencer->finish_patch_flags(isMigration);
      CUDASequencer->finish_patch_flags(isMigration);
      CUDASequencer->patchData->migrationFlagPerDevice[deviceCUDA->getDeviceIndex()] = 0; // flag it back to zero
    }
    CmiNodeBarrier();

    NAMD_EVENT_START(1, NamdProfileEvent::CUDASOA_LAUNCHPT2);
    if(isMasterPe){
      ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
#ifdef NAMD_CUDA
      const auto cudaGlobalMasterObject = cudaMgr->getCudaGlobalMaster();
#endif // NAMD_CUDA
      const bool addCudaGlobalForces =
#ifdef NAMD_CUDA
        (cudaGlobalMasterObject != nullptr) ?
        cudaGlobalMasterObject->willAddGlobalForces() :
#endif // NAMD_CUDA
        false;
      CUDASequencer->launch_part2(doMCPressure,
          timestep, nbondstep, slowstep,
          CUDASequencer->patchData->reduction,
          origin,
          step,
          maxForceUsed,
          langevinPistonStep,
          isMigration && (!simParams->useDeviceMigration),
          isCollection,
          doGlobalStaleForces || addCudaGlobalForces,
          doEnergy);
    }
    CmiNodeBarrier();
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_LAUNCHPT2);

    // Apply MC pressure control
    if(doMCPressure){
      monteCarloPressureControl(step, isMigration, 1, 1, maxForceUsed, doGlobalStaleForces);
      CmiNodeBarrier();
    }

    const bool requestTotalForces = (computeGlobal ? computeGlobal->getForceSendActive() : false) && doGlobalObjects;
    // continue launch_part2, after cellBasis fluctuation in MC barostat
    if(isMasterPe){
      ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
#ifdef NAMD_CUDA
      const auto CudaGlobalMasterObject = cudaMgr->getCudaGlobalMaster();
#endif // NAMD_CUDA
      const bool requestGPUTotalForces =
#ifdef NAMD_CUDA
        (CudaGlobalMasterObject != nullptr) ?
        CudaGlobalMasterObject->requestedTotalForces() :
#endif // NAMD_CUDA
        false;
      CUDASequencer->launch_part3(doMCPressure,
          timestep, nbondstep, slowstep,
          CUDASequencer->patchData->reduction,
          origin,
          step,
          maxForceUsed,
          requestTotalForces, // requested force
          doGlobalStaleForces,
          requestGPUTotalForces,
          isMigration,
          isCollection,
          doEnergy,
          isForcesOutputStep);
    }
    CmiNodeBarrier();
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_LAUNCHPT2);

    // save total force in computeGlobal; forces are copied from device
    // to host in launch_part3
    if (requestTotalForces) {
      CmiNodeBarrier();
      // store the total force for compute global clients
      int numhp = PatchMap::Object()->numHomePatches();
      HomePatchList *hpList = PatchMap::Object()->homePatchList();
      for(int i = 0; i < numhp; ++i) {
        HomePatch *hp = hpList->item(i).patch;
        computeGlobal->saveTotalForces(hp);
      }
    }


    NAMD_EVENT_START(1, NamdProfileEvent::CUDASOA_PRTSTEP);
    CmiNodeBarrier();

    if (deviceCUDA->getIsMasterDevice()) {
      // even though you're not on a printstep, calling this still takes approx 15us!!!
      c_out->printStep(step, CUDASequencer->patchData->reduction);
      // stochastic velocity rescaling:
      // get the coefficient from the current temperature,
      // to be applied on the NEXT loop iteration
      if (doVelocityRescale) {
        // calculate the coefficient based on the current temperature
        velrescaling = c_out->stochRescaleCoefficient();
      }
    }
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_PRTSTEP);

    NAMD_EVENT_START(1, NamdProfileEvent::RESET_REDUCTIONS);
    if(deviceCUDA->getIsMasterDevice()) CUDASequencer->patchData->reduction->zero();
    NAMD_EVENT_STOP(1, NamdProfileEvent::RESET_REDUCTIONS);

    NAMD_EVENT_START(1, NamdProfileEvent::CUDASOA_SUBCOL);
    if (isCollection) {
      CmiNodeBarrier();
      if (simParams->useDeviceMigration) {
        if (isMasterPe) {
          CUDASequencer->copyAoSDataToHost();
        }
        // Make sure the data has been copied to all home patches. All PEs
        // participate in outputting
        CmiNodeBarrier();
      }
      HomePatchList *hplist = patchMap->homePatchList();
      for (auto i= hplist->begin(); i != hplist->end(); i++) {
        HomePatch *hp = i->patch;
        hp->sequencer->submitCollections_SOA(step);
      }
      CmiNodeBarrier();
    }
    NAMD_EVENT_STOP(1, NamdProfileEvent::CUDASOA_SUBCOL);
  }

  if (simParams->useDeviceMigration) {
    CmiNodeBarrier();
    if (isMasterPe) {
      CUDASequencer->copyAoSDataToHost();
    }
  } else {
    if(isMasterPe) {
      CUDASequencer->updateHostPatchDataSOA();
      CUDASequencer->saveForceCUDASOA_direct(false, true, maxForceUsed);
    }
    if(isMasterPe) CUDASequencer->copyPositionsAndVelocitiesToHost(true,doGlobal);
    CmiNodeBarrier();
    HomePatchList *hplist = patchMap->homePatchList();
    for (auto i= hplist->begin(); i != hplist->end(); i++) {
      HomePatch *hp = i->patch;
      hp->copy_updates_to_AOS();
      hp->copy_forces_to_AOS(); // to support "output withforces"
    }
  }
  CmiNodeBarrier(); // Make sure the data has been copied to all home patches

  //CmiNodeBarrier();
  CUDASequencer->breakSuspends = true;
  wakeULTs();
  if(deviceCUDA->getIsMasterDevice()) c_out->awaken();
}

1697 
1698 /*
1699  * Updates device data after a migration
1700  *
1701  */
1702 void Sequencer::updateDeviceData(const int startup, const int maxForceUsed, const int doGlobal) {
1703  bool isMaster = deviceCUDA->getMasterPe() == CkMyPe();
1705  if (isMaster) {
1706  CUDASequencer->copyPatchData(true, startup);
1708  CUDASequencer->reallocateMigrationDestination();
1709  CUDASequencer->copyAtomDataToDeviceAoS();
1710  } else {
1711  CUDASequencer->copyAtomDataToDevice(startup, maxForceUsed);
1712  }
1713  CUDASequencer->migrationLocalPost(startup);
1714  CUDASequencer->migrationUpdateAdvancedFeatures(startup);
1715  // XXX This is only necessary if reallocation happens
1716  CUDASequencer->registerSOAPointersToHost();
1717  }
1718  CmiNodeBarrier();
1719  if (isMaster) {
1720  CUDASequencer->copySOAHostRegisterToDevice();
1721  if (simParams->useDeviceMigration) {
1722  CUDASequencer->patchData->atomReallocationFlagPerDevice[deviceCUDA->getDeviceIndex()] = 0;
1723  }
1724 
1725  if (doGlobal || simParams->forceDcdFrequency > 0) {
1726  CUDASequencer->updateHostPatchDataSOA(); // Needs to be called after HomePatch::domigration
1727  }
1728  }
1729  CmiNodeBarrier();
1730 }
1731 
1732 /*
1733  * Constructs the metadata structures storing the patch data for the GPU-resident code path
1734  *
1735  * This is called once during startup
1736  *
1737  */
1738 void Sequencer::constructDevicePatchMap() {
1739  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
1740  ComputeBondedCUDA* cudaBond = cudaMgr->getComputeBondedCUDA();
1741  CudaComputeNonbonded* cudaNbond = cudaMgr->getCudaComputeNonbonded();
1742 
1743  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1744  patchData = cpdata.ckLocalBranch();
1745 
1746  const bool isMasterPe = deviceCUDA->getMasterPe() == CkMyPe();
1747 
1748  // constructDevicePatchMap should only be called once per PE
1749  if (patchData->devicePatchMapFlag[CkMyPe()]) return;
1750  patchData->devicePatchMapFlag[CkMyPe()] = 1;
1751 
1752  // One thread per GPU will execute this block
1753  if (isMasterPe) {
1754  const int deviceIndex = deviceCUDA->getDeviceIndex();
1755 
1756  // The nonbonded patches are computed by CudaComputeNonbonded and contain all of the
1757  // patches (home and proxy) on this device. homePatches is computed by SequencerCUDA and
1758  // contains only the home patches. localPatches will be generated by this function
1759  using NBPatchRecord = CudaComputeNonbonded::PatchRecord;
1761  std::vector<NBPatchRecord>& nonbondedPatches = cudaNbond->getPatches();
1762  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1763  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1764 
1765  // The home patches are not necessarily ordered by their patchID. This can happen if there
1766  // are multiple PEs assigned to the same GPU. Sorting the home patches by their patch ID
1767  // makes it easy to have a consistent ordering
1768  std::stable_sort(
1769  homePatches.begin(),
1770  homePatches.end(),
1771  [](HomePatch* a, HomePatch* b) {
1772  return (a->getPatchID() < b->getPatchID());
1773  });
1774 
1775  // Iterate over all the patches on this device, add them to h_localPatches,
1776  // and determine whether each is a home or a proxy patch
1777  for (int i = 0; i < nonbondedPatches.size(); i++) {
1778  CudaLocalRecord record;
1779  record.patchID = nonbondedPatches[i].patchID;
1780 
1781  // TODO DMC the patchmap should be able to do this
1782  const int targetPatchID = record.patchID;
1783  auto result = std::find_if(
1784  homePatches.begin(),
1785  homePatches.end(),
1786  [targetPatchID](HomePatch* p) {
1787  return (p->getPatchID() == targetPatchID);
1788  });
1789 
1790  record.isProxy = (result == homePatches.end());
1791  localPatches.push_back(record);
1792  }
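 
  // Possible simplification (illustrative sketch, not current NAMD code): the
  // find_if above rescans homePatches for every nonbonded patch, which is
  // O(home * local). Hashing the home patch IDs once would make each lookup
  // O(1) expected time:
  //   std::unordered_set<int> homeIDs;
  //   for (HomePatch* hp : homePatches) homeIDs.insert(hp->getPatchID());
  //   ...
  //   record.isProxy = (homeIDs.count(record.patchID) == 0);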
1793 
1794  // The home patches should be at the beginning of the patch list.
1795  // This makes integration easier, since we can skip the proxy patches and operate on a
1796  // contiguous chunk of home atoms
1797  std::stable_sort(
1798  localPatches.begin(),
1799  localPatches.end(),
1800  [](CudaLocalRecord a, CudaLocalRecord b) {
1801  return (a.isProxy < b.isProxy);
1802  });
1803 
1804  // Now that the ordering is fixed, we can update the bonded and nonbonded orderings.
1805  // Since we have moved the home patches to the beginning, the ordering has changed
1806  cudaBond->updatePatchOrder(localPatches);
1807  cudaNbond->updatePatchOrder(localPatches);
1808  patchData->devData[deviceIndex].numPatchesHome = homePatches.size();
1809  patchData->devData[deviceIndex].numPatchesHomeAndProxy = localPatches.size();
1810  }
1811  CmiNodeBarrier();
1812 
1813  // Iterates over all patches again, and generates the mapping between GPUs. For each patch,
1814  // it checks the other devices to see if the patch is on that device.
1815  // - For HomePatches, there will be a peer record for all of its proxies
1816  // - For ProxyPatches, there will only be a peer record for its home patch
1817  // There is a single array of peer records per device. Each patch stores an offset into this
1818  // array as well as its number of peer records
1819  if (isMasterPe) {
1820  const int deviceIndex = deviceCUDA->getDeviceIndex();
1821  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
1822  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1823 
1824  for (int i = 0; i < localPatches.size(); i++) {
1825  std::vector<CudaPeerRecord> tempPeers;
1826  const int targetPatchID = localPatches[i].patchID;
1827  const int targetIsProxy = localPatches[i].isProxy;
1828 
1829  for (int devIdx = 0; devIdx < deviceCUDA->getNumDevice(); devIdx++) {
1830  if (devIdx == deviceIndex) continue;
1831  std::vector<CudaLocalRecord>& peerPatches = patchData->devData[devIdx].h_localPatches;
1832 
1833  // Search peerPatches for the target patchID. A peer record is added only when
1834  // the proxy status differs, pairing a home patch with its proxies and vice versa
1835  for (int j = 0; j < patchData->devData[devIdx].numPatchesHomeAndProxy; j++) {
1836  const CudaLocalRecord peer = peerPatches[j];
1837  if (peer.patchID == targetPatchID && peer.isProxy != targetIsProxy) {
1838  CudaPeerRecord peerRecord;
1839  peerRecord.deviceIndex = devIdx;
1840  peerRecord.patchIndex = j;
1841  tempPeers.push_back(peerRecord);
1842  break;
1843  }
1844  }
1845  }
1846 
1847  // Once we have the list of peer records, add them to the single device-width vector
1848  // and record the offset and count
1849  localPatches[i].numPeerRecord = tempPeers.size();
1850  if (!tempPeers.empty()) {
1851  localPatches[i].peerRecordStartIndex = myPeerPatches.size();
1852  myPeerPatches.insert(myPeerPatches.end(), tempPeers.begin(), tempPeers.end());
1853  }
1854  }
1855  }
1856  CmiNodeBarrier();
1857 }
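 
// Illustrative example of the layout built by constructDevicePatchMap above
// (assumed patch IDs, not taken from a real run): with home patches {3, 7}
// and one proxy patch {5} on this device, h_localPatches ends up ordered
//   [ {patchID=3, isProxy=0}, {patchID=7, isProxy=0}, {patchID=5, isProxy=1} ]
// so numPatchesHome = 2 indexes the contiguous home prefix, and each record's
// peerRecordStartIndex / numPeerRecord selects its slice of h_peerPatches.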
1858 
1859 void Sequencer::printDevicePatchMap() {
1860  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1861  patchData = cpdata.ckLocalBranch();
1862 
1863  const bool isMasterPe = deviceCUDA->getMasterPe() == CkMyPe();
1864 
1865  if (isMasterPe) {
1866  const int deviceIndex = deviceCUDA->getDeviceIndex();
1867  const int numPatchesHome = patchData->devData[deviceIndex].numPatchesHome;
1868  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1869 
1870  CmiLock(patchData->printlock);
1871  CkPrintf("PE: %d\n", CkMyPe());
1872 
1873  CkPrintf("[%d] Home patches %d Local patches %d\n", CkMyPe(), numPatchesHome, (int)localPatches.size());
1874 
1875  CkPrintf("Home Patches: ");
1876  for (int i = 0; i < numPatchesHome; i++) {
1877  CkPrintf("%d ", localPatches[i].patchID);
1878  }
1879  CkPrintf("\n");
1880 
1881  CkPrintf("Proxy Patches: ");
1882  for (int i = numPatchesHome; i < localPatches.size(); i++) {
1883  CkPrintf("%d ", localPatches[i].patchID);
1884  }
1885  CkPrintf("\n");
1886 
1887  CmiUnlock(patchData->printlock);
1888  }
1889  CmiNodeBarrier();
1890 }
1891 
1892 void Sequencer::clearDevicePatchMap() {
1893  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1894  patchData = cpdata.ckLocalBranch();
1895 
1896  const bool isMasterPe = deviceCUDA->getMasterPe() == CkMyPe();
1897 
1898  // clearDevicePatchMap should only be called once per PE
1899  if (!patchData->devicePatchMapFlag[CkMyPe()]) return;
1900  patchData->devicePatchMapFlag[CkMyPe()] = 0;
1901 
1902  // One thread per GPU will execute this block
1903  if (isMasterPe) {
1904  const int deviceIndex = deviceCUDA->getDeviceIndex();
1905 
1906  using NBPatchRecord = CudaComputeNonbonded::PatchRecord;
1907  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1908  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1909  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1910 
1911  homePatches.clear();
1912  localPatches.clear();
1913  peerPatches.clear();
1915  }
1916 }
1917 
1918 /*
1919  * Updates the metadata structures storing the patch data for the GPU-resident code path
1920  *
1921  * This is called every migration step. The actual mapping stays the same,
1922  * but the atom counts per patch change
1923  *
1924  */
1925 void Sequencer::updateDevicePatchMap(int startup) {
1926  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1927  patchData = cpdata.ckLocalBranch();
1928 
1929  const bool isMasterPe = deviceCUDA->getMasterPe() == CkMyPe();
1930 
1931  if (isMasterPe) {
1932  const int deviceIndex = deviceCUDA->getDeviceIndex();
1933  const int numPatchesHome = patchData->devData[deviceIndex].numPatchesHome;
1934  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1936  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
1937  CudaComputeNonbonded* cudaNbond = cudaMgr->getCudaComputeNonbonded();
1938 
1939  int max_atom_count = 0;
1940  int total_atom_count = 0;
1941 
1942  // Update the atom count of home patches
1943  for (int i = 0; i < numPatchesHome; i++) {
1944  Patch* patch = NULL;
1945  for(int j = 0; j < deviceCUDA->getNumPesSharingDevice(); j++){
1946  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(j));
1947  patch = pm->patch(localPatches[i].patchID);
1948  if (patch != NULL) break;
1949  }
1950  if (patch == NULL) NAMD_die("Sequencer: Failed to find patch in updateDevicePatchMap");
1951 
1952  localPatches[i].numAtoms = patch->getNumAtoms();
1953  localPatches[i].numAtomsNBPad = CudaComputeNonbondedKernel::computeAtomPad(localPatches[i].numAtoms);
1954 
1955  if (localPatches[i].numAtoms > max_atom_count) max_atom_count = localPatches[i].numAtoms;
1956  total_atom_count += localPatches[i].numAtoms;
1957  }
1958  }
1959  CmiNodeBarrier();
1960 
1961  // Update the proxy patches next, using the home patch atom counts of other devices
1962  if (isMasterPe) {
1963  const int deviceIndex = deviceCUDA->getDeviceIndex();
1964  const int numPatchesHome = patchData->devData[deviceIndex].numPatchesHome;
1965  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1966  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1967  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1968 
1969  for (int i = numPatchesHome; i < numPatchesHomeAndProxy; i++) {
1970  const int index = localPatches[i].peerRecordStartIndex;
1971  const int devIdx = peerPatches[index].deviceIndex;
1972  const int peerIdx = peerPatches[index].patchIndex;
1973  const CudaLocalRecord peer = patchData->devData[devIdx].h_localPatches[peerIdx];
1974 
1975  localPatches[i].numAtoms = peer.numAtoms;
1976  localPatches[i].numAtomsNBPad = peer.numAtomsNBPad;
1977  }
1978  }
1979  CmiNodeBarrier();
1980 
1981  // Computes the offset for each patch using the atom counts
1982  if (isMasterPe) {
1983  const int deviceIndex = deviceCUDA->getDeviceIndex();
1984  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1985  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1986 
1987  int runningOffset = 0;
1988  int runningOffsetNBPad = 0;
1989  // TODO Change to a C++ prefix sum
1990  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1991  localPatches[i].bufferOffset = runningOffset;
1992  localPatches[i].bufferOffsetNBPad = runningOffsetNBPad;
1993  runningOffset += localPatches[i].numAtoms;
1994  runningOffsetNBPad += localPatches[i].numAtomsNBPad;
1995  }
1996  }
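  // Sketch of the prefix sum suggested by the TODO above (illustrative only;
  // std::exclusive_scan is from <numeric>, C++17):
  //   std::vector<int> counts(numPatchesHomeAndProxy);
  //   std::transform(localPatches.begin(), localPatches.end(), counts.begin(),
  //                  [](const CudaLocalRecord& r) { return r.numAtoms; });
  //   std::vector<int> offsets(counts.size());
  //   std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0);
  //   // offsets[i] == bufferOffset for patch i; repeat with numAtomsNBPad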
1997  CmiNodeBarrier();
1998 
1999  // Update the peer records using the local record data
2000  if (isMasterPe) {
2001  const int deviceIndex = deviceCUDA->getDeviceIndex();
2002  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
2003  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
2004  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
2005 
2006 
2007  for (int i = 0; i < peerPatches.size(); i++) {
2008  const int devIdx = peerPatches[i].deviceIndex;
2009  const int peerIdx = peerPatches[i].patchIndex;
2010  const CudaLocalRecord peer = patchData->devData[devIdx].h_localPatches[peerIdx];
2011 
2012  peerPatches[i].bufferOffset = peer.bufferOffset;
2013  peerPatches[i].bufferOffsetNBPad = peer.bufferOffsetNBPad;
2014  }
2015 
2016  // Update inline copy of peer data
2017  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
2018  const int numPeerRecord = localPatches[i].numPeerRecord;
2019  const int peerOffset = localPatches[i].peerRecordStartIndex;
2020 
2021  for (int j = 0; j < std::min(numPeerRecord, CudaLocalRecord::num_inline_peer); j++) {
2022  localPatches[i].inline_peers[j] = peerPatches[peerOffset+j];
2023  }
2024  }
2025  }
2026  CmiNodeBarrier();
2027 }
2028 
2029 #endif
2030 
2031 
2032 void Sequencer::integrate_SOA(int scriptTask) {
2033  //
2034  // Below when accessing the array buffers for position, velocity, force,
2035  // note that we don't want to set up pointers directly to the buffers
2036  // because the allocations might get resized after atom migration.
2037  //
2038 
2039 #ifdef TIMER_COLLECTION
2040  TimerSet& t = patch->timerSet;
2041 #endif
2042  TIMER_INIT_WIDTH(t, KICK, simParams->timerBinWidth);
2043  TIMER_INIT_WIDTH(t, MAXMOVE, simParams->timerBinWidth);
2044  TIMER_INIT_WIDTH(t, DRIFT, simParams->timerBinWidth);
2045  TIMER_INIT_WIDTH(t, PISTON, simParams->timerBinWidth);
2046  TIMER_INIT_WIDTH(t, SUBMITHALF, simParams->timerBinWidth);
2047  TIMER_INIT_WIDTH(t, VELBBK1, simParams->timerBinWidth);
2048  TIMER_INIT_WIDTH(t, VELBBK2, simParams->timerBinWidth);
2049  TIMER_INIT_WIDTH(t, RATTLE1, simParams->timerBinWidth);
2050  TIMER_INIT_WIDTH(t, SUBMITFULL, simParams->timerBinWidth);
2051  TIMER_INIT_WIDTH(t, SUBMITCOLLECT, simParams->timerBinWidth);
2052 
2053  // Keep track of the step number.
2054  int &step = patch->flags.step;
2055  step = simParams->firstTimestep;
2056 
2057  // For multiple time stepping, which force boxes are used?
2058  int &maxForceUsed = patch->flags.maxForceUsed;
2059  int &maxForceMerged = patch->flags.maxForceMerged;
2060  maxForceUsed = Results::normal;
2061  maxForceMerged = Results::normal;
2062 
2063  // Keep track of total steps and steps per cycle.
2064  const int numberOfSteps = simParams->N;
2065  //const int stepsPerCycle = simParams->stepsPerCycle;
2066  CheckStep stepsPerCycle;
2067  stepsPerCycle.init(step, simParams->stepsPerCycle);
2068  // The fundamental time step, get the scaling right for velocity units.
2069  const BigReal timestep = simParams->dt * RECIP_TIMEFACTOR;
2070 
2071  //const int nonbondedFrequency = simParams->nonbondedFrequency;
2072  //slowFreq = nonbondedFrequency;
2073  CheckStep nonbondedFrequency;
2074  slowFreq = simParams->nonbondedFrequency;
2075  // The step size for short-range nonbonded forces.
2076  const BigReal nbondstep = timestep * simParams->nonbondedFrequency;
2077  int &doNonbonded = patch->flags.doNonbonded;
2078  //doNonbonded = (step >= numberOfSteps) || !(step%nonbondedFrequency);
2079  doNonbonded = (step >= numberOfSteps) ||
2080  nonbondedFrequency.init(step, simParams->nonbondedFrequency);
2081  //if ( nonbondedFrequency == 1 ) maxForceMerged = Results::nbond;
2082  if ( nonbondedFrequency.period == 1 ) maxForceMerged = Results::nbond;
2083  if ( doNonbonded ) maxForceUsed = Results::nbond;
2084 
2085  // Do we do full electrostatics?
2086  const int dofull = ( simParams->fullElectFrequency ? 1 : 0 );
2087  //const int fullElectFrequency = simParams->fullElectFrequency;
2088  //if ( dofull ) slowFreq = fullElectFrequency;
2089  CheckStep fullElectFrequency;
2090  if ( dofull ) slowFreq = simParams->fullElectFrequency;
2091  // The step size for long-range electrostatics.
2092  const BigReal slowstep = timestep * simParams->fullElectFrequency;
2093  int &doFullElectrostatics = patch->flags.doFullElectrostatics;
2094  //doFullElectrostatics = (dofull &&
2095  // ((step >= numberOfSteps) || !(step%fullElectFrequency)));
2096  doFullElectrostatics = (dofull &&
2097  ((step >= numberOfSteps) ||
2098  fullElectFrequency.init(step, simParams->fullElectFrequency)));
2099  //if ( dofull && fullElectFrequency == 1 ) maxForceMerged = Results::slow;
2100  if ( dofull && fullElectFrequency.period == 1 ) maxForceMerged = Results::slow;
2101  if ( doFullElectrostatics ) maxForceUsed = Results::slow;
2102 
2103  // Bother to calculate energies?
2104  int &doEnergy = patch->flags.doEnergy;
2105  //int energyFrequency = simParams->outputEnergies;
2106  CheckStep energyFrequency;
2107  int newComputeEnergies = simParams->computeEnergies;
2108  if(simParams->alchOn) newComputeEnergies = NAMD_gcd(newComputeEnergies, simParams->alchOutFreq);
2109  doEnergy = energyFrequency.init(step, newComputeEnergies);
2110 
2111  // Do we need to return forces to TCL script or Colvar module?
2112  int doTcl = simParams->tclForcesOn;
2113  int doColvars = simParams->colvarsOn;
2114  int doGlobal = doTcl || doColvars;
2115  ComputeGlobal *computeGlobal = Node::Object()->computeMgr->computeGlobalObject;
2116  int &doVirial = patch->flags.doVirial;
2117  doVirial = 1;
2118 
2119  // The following flags have to be explicitly disabled in Patch object.
2120  patch->flags.doMolly = 0;
2121  patch->flags.doLoweAndersen = 0;
2122  patch->flags.doGBIS = 0;
2123  patch->flags.doLCPO = 0;
2124 
2125  // Square of maximum velocity for simulation safety check
2126  const BigReal maxvel2 =
2127  (simParams->cutoff * simParams->cutoff) / (timestep * timestep);
2128 
2129  // check for Langevin piston
2130  // set period beyond numberOfSteps to disable
2131  CheckStep langevinPistonFrequency;
2132  langevinPistonFrequency.init(step,
2133  (simParams->langevinPistonOn ? slowFreq : numberOfSteps+1 ),
2134  (simParams->langevinPistonOn ? -1-slowFreq/2 : 0) /* = delta */);
2135 
2136  // check for output
2137  // set period beyond numberOfSteps to disable
2138  CheckStep restartFrequency;
2139  restartFrequency.init(step, (simParams->restartFrequency ?
2140  simParams->restartFrequency : numberOfSteps+1) );
2141  CheckStep dcdFrequency;
2142  dcdFrequency.init(step, (simParams->dcdFrequency ?
2143  simParams->dcdFrequency : numberOfSteps+1) );
2144  CheckStep velDcdFrequency;
2145  velDcdFrequency.init(step, (simParams->velDcdFrequency ?
2146  simParams->velDcdFrequency : numberOfSteps+1) );
2147  CheckStep forceDcdFrequency;
2148  forceDcdFrequency.init(step, (simParams->forceDcdFrequency ?
2149  simParams->forceDcdFrequency : numberOfSteps+1) );
2150  CheckStep imdFrequency;
2151  imdFrequency.init(step, (simParams->IMDfreq ?
2152  simParams->IMDfreq : numberOfSteps+1) );
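  // The CheckStep objects above replace per-step modulo tests: init() seeds
  // nextstep at the first on-period step, and each check(step) is then a
  // single comparison that advances nextstep by period when it fires. For
  // example, with period 10 starting from step 0, check() returns 1 only at
  // steps 10, 20, 30, ... A feature is disabled by passing a period of
  // numberOfSteps+1, which check() can never reach.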
2153 
2154  if ( scriptTask == SCRIPT_RUN ) {
2155  // enforce rigid bond constraints on initial positions
2156  TIMER_START(t, RATTLE1);
2157  rattle1_SOA(0., 0);
2158  TIMER_STOP(t, RATTLE1);
2159 
2160  // must migrate here!
2161  int natoms = patch->patchDataSOA.numAtoms;
2162  runComputeObjects_SOA(1, step<numberOfSteps, step);
2163  // kick -0.5
2164  TIMER_START(t, KICK);
2165  addForceToMomentum_SOA(-0.5, timestep, nbondstep, slowstep,
2166 #ifndef SOA_SIMPLIFY_PARAMS
2167  patch->patchDataSOA.recipMass,
2168  patch->patchDataSOA.f_normal_x,
2169  patch->patchDataSOA.f_normal_y,
2170  patch->patchDataSOA.f_normal_z,
2171  patch->patchDataSOA.f_nbond_x,
2172  patch->patchDataSOA.f_nbond_y,
2173  patch->patchDataSOA.f_nbond_z,
2174  patch->patchDataSOA.f_slow_x,
2175  patch->patchDataSOA.f_slow_y,
2176  patch->patchDataSOA.f_slow_z,
2177  patch->patchDataSOA.vel_x,
2178  patch->patchDataSOA.vel_y,
2179  patch->patchDataSOA.vel_z,
2180  patch->patchDataSOA.numAtoms,
2181 #endif
2182  maxForceUsed
2183  );
2184  TIMER_STOP(t, KICK);
2185 
2186  TIMER_START(t, RATTLE1);
2187  rattle1_SOA(-timestep, 0);
2188  TIMER_STOP(t, RATTLE1);
2189 
2190  TIMER_START(t, SUBMITHALF);
2191  submitHalfstep_SOA(
2192 #ifndef SOA_SIMPLIFY_PARAMS
2193  patch->patchDataSOA.hydrogenGroupSize,
2194  patch->patchDataSOA.mass,
2195  patch->patchDataSOA.vel_x,
2196  patch->patchDataSOA.vel_y,
2197  patch->patchDataSOA.vel_z,
2198  patch->patchDataSOA.numAtoms
2199 #endif
2200  );
2201  TIMER_STOP(t, SUBMITHALF);
2202 
2203  // kick 1.0
2204  TIMER_START(t, KICK);
2205  addForceToMomentum_SOA(1.0, timestep, nbondstep, slowstep,
2206 #ifndef SOA_SIMPLIFY_PARAMS
2207  patch->patchDataSOA.recipMass,
2208  patch->patchDataSOA.f_normal_x,
2209  patch->patchDataSOA.f_normal_y,
2210  patch->patchDataSOA.f_normal_z,
2211  patch->patchDataSOA.f_nbond_x,
2212  patch->patchDataSOA.f_nbond_y,
2213  patch->patchDataSOA.f_nbond_z,
2214  patch->patchDataSOA.f_slow_x,
2215  patch->patchDataSOA.f_slow_y,
2216  patch->patchDataSOA.f_slow_z,
2217  patch->patchDataSOA.vel_x,
2218  patch->patchDataSOA.vel_y,
2219  patch->patchDataSOA.vel_z,
2220  patch->patchDataSOA.numAtoms,
2221 #endif
2222  maxForceUsed
2223  );
2224  TIMER_STOP(t, KICK);
2225 
2226  TIMER_START(t, RATTLE1);
2227  rattle1_SOA(timestep, 1);
2228  TIMER_STOP(t, RATTLE1);
2229 
2230  // save total force in computeGlobal
2231  if (doGlobal) {
2232  computeGlobal->saveTotalForces(patch);
2233  }
2234 
2235  TIMER_START(t, SUBMITHALF);
2236  submitHalfstep_SOA(
2237 #ifndef SOA_SIMPLIFY_PARAMS
2238  patch->patchDataSOA.hydrogenGroupSize,
2239  patch->patchDataSOA.mass,
2240  patch->patchDataSOA.vel_x,
2241  patch->patchDataSOA.vel_y,
2242  patch->patchDataSOA.vel_z,
2243  patch->patchDataSOA.numAtoms
2244 #endif
2245  );
2246  TIMER_STOP(t, SUBMITHALF);
2247 
2248  // kick -0.5
2249  TIMER_START(t, KICK);
2250  addForceToMomentum_SOA(-0.5, timestep, nbondstep, slowstep,
2251 #ifndef SOA_SIMPLIFY_PARAMS
2252  patch->patchDataSOA.recipMass,
2253  patch->patchDataSOA.f_normal_x,
2254  patch->patchDataSOA.f_normal_y,
2255  patch->patchDataSOA.f_normal_z,
2256  patch->patchDataSOA.f_nbond_x,
2257  patch->patchDataSOA.f_nbond_y,
2258  patch->patchDataSOA.f_nbond_z,
2259  patch->patchDataSOA.f_slow_x,
2260  patch->patchDataSOA.f_slow_y,
2261  patch->patchDataSOA.f_slow_z,
2262  patch->patchDataSOA.vel_x,
2263  patch->patchDataSOA.vel_y,
2264  patch->patchDataSOA.vel_z,
2265  patch->patchDataSOA.numAtoms,
2266 #endif
2267  maxForceUsed
2268  );
2269  TIMER_STOP(t, KICK);
2270 
2271  TIMER_START(t, SUBMITFULL);
2272  submitReductions_SOA(
2273 #ifndef SOA_SIMPLIFY_PARAMS
2274  patch->patchDataSOA.hydrogenGroupSize,
2275  patch->patchDataSOA.mass,
2276  patch->patchDataSOA.pos_x,
2277  patch->patchDataSOA.pos_y,
2278  patch->patchDataSOA.pos_z,
2279  patch->patchDataSOA.vel_x,
2280  patch->patchDataSOA.vel_y,
2281  patch->patchDataSOA.vel_z,
2282  patch->patchDataSOA.f_normal_x,
2283  patch->patchDataSOA.f_normal_y,
2284  patch->patchDataSOA.f_normal_z,
2285  patch->patchDataSOA.f_nbond_x,
2286  patch->patchDataSOA.f_nbond_y,
2287  patch->patchDataSOA.f_nbond_z,
2288  patch->patchDataSOA.f_slow_x,
2289  patch->patchDataSOA.f_slow_y,
2290  patch->patchDataSOA.f_slow_z,
2291  patch->patchDataSOA.numAtoms
2292 #endif
2293  );
2294  TIMER_STOP(t, SUBMITFULL);
2295 
2296  rebalanceLoad(step);
2297  } // scriptTask == SCRIPT_RUN
2298 
2299 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
2300  int& eon = patch->flags.event_on;
2301  int epid = (simParams->beginEventPatchID <= patch->getPatchID()
2302  && patch->getPatchID() <= simParams->endEventPatchID);
2303  int beginStep = simParams->beginEventStep;
2304  int endStep = simParams->endEventStep;
2305  bool controlProfiling = patch->getPatchID() == 0;
2306 #endif
2307 
2308  for ( ++step; step <= numberOfSteps; ++step ) {
2309  int dcdSelectionChecks=0;
2310  Molecule *molecule = Node::Object()->molecule;
2311  for(int dcdindex=0; dcdindex<16;++dcdindex)
2312  {
2313  int dcdSelectionFrequency = molecule->dcdSelectionParams[dcdindex].frequency;
2314  if(dcdSelectionFrequency && step % dcdSelectionFrequency)
2315  dcdSelectionChecks++;
2316  }
2317  const int isCollection = restartFrequency.check(step) +
2318  dcdFrequency.check(step) + velDcdFrequency.check(step) +
2319  forceDcdFrequency.check(step) + imdFrequency.check(step) +
2320  dcdSelectionChecks;
2321  const int isMigration = stepsPerCycle.check(step);
2322  doEnergy = energyFrequency.check(step);
2323  DebugM(3,"doGlobal now "<< doGlobal<<"\n"<<endi);
2324 
2325 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
2326  eon = epid && (beginStep < step && step <= endStep);
2327 
2328  if (controlProfiling && step == beginStep) {
2329  NAMD_PROFILE_START();
2330  }
2331  if (controlProfiling && step == endStep) {
2332  NAMD_PROFILE_STOP();
2333  }
2334 // NAMD_EVENT_START(eon, NamdProfileEvent::INTEGRATE_SOA_1);
2335  char buf[32];
2336  snprintf(buf, sizeof(buf), "%s: %d", NamdProfileEventStr[NamdProfileEvent::INTEGRATE_SOA_1], patch->getPatchID());
2337  NAMD_EVENT_START_EX(eon, NamdProfileEvent::INTEGRATE_SOA_1, buf);
2338 #endif
2339 
2340  if ( simParams->stochRescaleOn ) {
2341  stochRescaleVelocities_SOA(step);
2342  }
2343 
2344  if ( simParams->berendsenPressureOn ) {
2345  berendsenPressure_SOA(
2346 #ifndef SOA_SIMPLIFY_PARAMS
2347  patch->patchDataSOA.hydrogenGroupSize,
2348  patch->patchDataSOA.mass,
2349  patch->patchDataSOA.pos_x,
2350  patch->patchDataSOA.pos_y,
2351  patch->patchDataSOA.pos_z,
2352  patch->patchDataSOA.numAtoms,
2353 #endif
2354  step);
2355  }
2356 
2357  // kick 0.5
2358  TIMER_START(t, KICK);
2359  addForceToMomentum_SOA(0.5, timestep, nbondstep, slowstep,
2360 #ifndef SOA_SIMPLIFY_PARAMS
2361  patch->patchDataSOA.recipMass,
2362  patch->patchDataSOA.f_normal_x,
2363  patch->patchDataSOA.f_normal_y,
2364  patch->patchDataSOA.f_normal_z,
2365  patch->patchDataSOA.f_nbond_x,
2366  patch->patchDataSOA.f_nbond_y,
2367  patch->patchDataSOA.f_nbond_z,
2368  patch->patchDataSOA.f_slow_x,
2369  patch->patchDataSOA.f_slow_y,
2370  patch->patchDataSOA.f_slow_z,
2371  patch->patchDataSOA.vel_x,
2372  patch->patchDataSOA.vel_y,
2373  patch->patchDataSOA.vel_z,
2374  patch->patchDataSOA.numAtoms,
2375 #endif
2376  maxForceUsed
2377  );
2378  TIMER_STOP(t, KICK);
2379 
2380  // maximumMove checks velocity bound on atoms
2381  TIMER_START(t, MAXMOVE);
2382  maximumMove_SOA(timestep, maxvel2
2383 #ifndef SOA_SIMPLIFY_PARAMS
2384  ,
2385  patch->patchDataSOA.vel_x,
2386  patch->patchDataSOA.vel_y,
2387  patch->patchDataSOA.vel_z,
2388  patch->patchDataSOA.numAtoms
2389 #endif
2390  );
2391  TIMER_STOP(t, MAXMOVE);
2392 
2393 
2394  NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_SOA_1);
2395 
2396  // Check to see if Langevin piston is enabled this step:
2397  // ! ((step-1-slowFreq/2) % slowFreq)
2398  if ( langevinPistonFrequency.check(step) ) {
2399  // if (langevinPistonStep) {
2400  // drift 0.5
2401  TIMER_START(t, DRIFT);
2402  addVelocityToPosition_SOA(0.5*timestep
2403 #ifndef SOA_SIMPLIFY_PARAMS
2404  ,
2405  patch->patchDataSOA.vel_x,
2406  patch->patchDataSOA.vel_y,
2407  patch->patchDataSOA.vel_z,
2408  patch->patchDataSOA.pos_x,
2409  patch->patchDataSOA.pos_y,
2410  patch->patchDataSOA.pos_z,
2411  patch->patchDataSOA.numAtoms
2412 #endif
2413  );
2414  TIMER_STOP(t, DRIFT);
2415  // There is a blocking receive inside of langevinPiston()
2416  // that might suspend the current thread of execution,
2417  // so split profiling around this conditional block.
2418  langevinPiston_SOA(
2419 #ifndef SOA_SIMPLIFY_PARAMS
2420  patch->patchDataSOA.hydrogenGroupSize,
2421  patch->patchDataSOA.mass,
2422  patch->patchDataSOA.pos_x,
2423  patch->patchDataSOA.pos_y,
2424  patch->patchDataSOA.pos_z,
2425  patch->patchDataSOA.vel_x,
2426  patch->patchDataSOA.vel_y,
2427  patch->patchDataSOA.vel_z,
2428  patch->patchDataSOA.numAtoms,
2429 #endif
2430  step
2431  );
2432 
2433  // drift 0.5
2434  TIMER_START(t, DRIFT);
2435  addVelocityToPosition_SOA(0.5*timestep
2436 #ifndef SOA_SIMPLIFY_PARAMS
2437  ,
2438  patch->patchDataSOA.vel_x,
2439  patch->patchDataSOA.vel_y,
2440  patch->patchDataSOA.vel_z,
2441  patch->patchDataSOA.pos_x,
2442  patch->patchDataSOA.pos_y,
2443  patch->patchDataSOA.pos_z,
2444  patch->patchDataSOA.numAtoms
2445 #endif
2446  );
2447  TIMER_STOP(t, DRIFT);
2448  }
2449  else {
2450  // drift 1.0
2451  TIMER_START(t, DRIFT);
2452  addVelocityToPosition_SOA(timestep
2453 #ifndef SOA_SIMPLIFY_PARAMS
2454  ,
2455  patch->patchDataSOA.vel_x,
2456  patch->patchDataSOA.vel_y,
2457  patch->patchDataSOA.vel_z,
2458  patch->patchDataSOA.pos_x,
2459  patch->patchDataSOA.pos_y,
2460  patch->patchDataSOA.pos_z,
2461  patch->patchDataSOA.numAtoms
2462 #endif
2463  );
2464  TIMER_STOP(t, DRIFT);
2465  }
2466 
2467  //NAMD_EVENT_START(eon, NamdProfileEvent::INTEGRATE_SOA_2);
2468 
2469  // There are NO sends in submitHalfstep(), just local summation
2470  // into the Reduction struct.
2471  TIMER_START(t, SUBMITHALF);
2472  submitHalfstep_SOA(
2473 #ifndef SOA_SIMPLIFY_PARAMS
2474  patch->patchDataSOA.hydrogenGroupSize,
2475  patch->patchDataSOA.mass,
2476  patch->patchDataSOA.vel_x,
2477  patch->patchDataSOA.vel_y,
2478  patch->patchDataSOA.vel_z,
2479  patch->patchDataSOA.numAtoms
2480 #endif
2481  );
2482  TIMER_STOP(t, SUBMITHALF);
2483 
2484  //doNonbonded = !(step%nonbondedFrequency);
2485  doNonbonded = nonbondedFrequency.check(step);
2486  //doFullElectrostatics = (dofull && !(step%fullElectFrequency));
2487  doFullElectrostatics = (dofull && fullElectFrequency.check(step));
2488 
2489  maxForceUsed = Results::normal;
2490  if ( doNonbonded ) maxForceUsed = Results::nbond;
2491  if ( doFullElectrostatics ) maxForceUsed = Results::slow;
2492 
2493  // Migrate Atoms on stepsPerCycle
2494  // Check to see if this is energy evaluation step:
2495  // doEnergy = ! ( step % energyFrequency );
2496  doVirial = 1;
2497  doKineticEnergy = 1;
2498  doMomenta = 1;
2499 
2500  //NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_SOA_2); // integrate_SOA 2
2501 
2502  // The current thread of execution will suspend in runComputeObjects().
2503  // Check to see if we are at a migration step:
2504  // runComputeObjects_SOA(!(step%stepsPerCycle), step<numberOfSteps);
2505  runComputeObjects_SOA(isMigration, step<numberOfSteps, step);
2506 
2507  NAMD_EVENT_START(eon, NamdProfileEvent::INTEGRATE_SOA_3);
2508 
2509  TIMER_START(t, VELBBK1);
2510  langevinVelocitiesBBK1_SOA(
2511  timestep
2512 #ifndef SOA_SIMPLIFY_PARAMS
2513  ,
2514  patch->patchDataSOA.langevinParam,
2515  patch->patchDataSOA.vel_x,
2516  patch->patchDataSOA.vel_y,
2517  patch->patchDataSOA.vel_z,
2518  patch->patchDataSOA.numAtoms
2519 #endif
2520  );
2521  TIMER_STOP(t, VELBBK1);
2522 
2523  // kick 1.0
2524  TIMER_START(t, KICK);
2525  addForceToMomentum_SOA(1.0, timestep, nbondstep, slowstep,
2526 #ifndef SOA_SIMPLIFY_PARAMS
2527  patch->patchDataSOA.recipMass,
2528  patch->patchDataSOA.f_normal_x,
2529  patch->patchDataSOA.f_normal_y,
2530  patch->patchDataSOA.f_normal_z,
2531  patch->patchDataSOA.f_nbond_x,
2532  patch->patchDataSOA.f_nbond_y,
2533  patch->patchDataSOA.f_nbond_z,
2534  patch->patchDataSOA.f_slow_x,
2535  patch->patchDataSOA.f_slow_y,
2536  patch->patchDataSOA.f_slow_z,
2537  patch->patchDataSOA.vel_x,
2538  patch->patchDataSOA.vel_y,
2539  patch->patchDataSOA.vel_z,
2540  patch->patchDataSOA.numAtoms,
2541 #endif
2542  maxForceUsed
2543  );
2544  TIMER_STOP(t, KICK);
2545 
2546  TIMER_START(t, VELBBK2);
2547  langevinVelocitiesBBK2_SOA(
2548  timestep
2549 #ifndef SOA_SIMPLIFY_PARAMS
2550  ,
2551  patch->patchDataSOA.langevinParam,
2552  patch->patchDataSOA.langScalVelBBK2,
2553  patch->patchDataSOA.langScalRandBBK2,
2554  patch->patchDataSOA.gaussrand_x,
2555  patch->patchDataSOA.gaussrand_y,
2556  patch->patchDataSOA.gaussrand_z,
2557  patch->patchDataSOA.vel_x,
2558  patch->patchDataSOA.vel_y,
2559  patch->patchDataSOA.vel_z,
2560  patch->patchDataSOA.numAtoms
2561 #endif
2562  );
2563  TIMER_STOP(t, VELBBK2);
2564 
2565  TIMER_START(t, RATTLE1);
2566  rattle1_SOA(timestep, 1);
2567  TIMER_STOP(t, RATTLE1);
2568 
2569  // save total force in computeGlobal
2570  if (doGlobal) {
2571  computeGlobal->saveTotalForces(patch);
2572  }
2573 
2574  TIMER_START(t, SUBMITHALF);
2575  submitHalfstep_SOA(
2576 #ifndef SOA_SIMPLIFY_PARAMS
2577  patch->patchDataSOA.hydrogenGroupSize,
2578  patch->patchDataSOA.mass,
2579  patch->patchDataSOA.vel_x,
2580  patch->patchDataSOA.vel_y,
2581  patch->patchDataSOA.vel_z,
2582  patch->patchDataSOA.numAtoms
2583 #endif
2584  );
2585  TIMER_STOP(t, SUBMITHALF);
2586 
2587  // kick -0.5
2588  TIMER_START(t, KICK);
2589  addForceToMomentum_SOA(-0.5, timestep, nbondstep, slowstep,
2590 #ifndef SOA_SIMPLIFY_PARAMS
2591  patch->patchDataSOA.recipMass,
2592  patch->patchDataSOA.f_normal_x,
2593  patch->patchDataSOA.f_normal_y,
2594  patch->patchDataSOA.f_normal_z,
2595  patch->patchDataSOA.f_nbond_x,
2596  patch->patchDataSOA.f_nbond_y,
2597  patch->patchDataSOA.f_nbond_z,
2598  patch->patchDataSOA.f_slow_x,
2599  patch->patchDataSOA.f_slow_y,
2600  patch->patchDataSOA.f_slow_z,
2601  patch->patchDataSOA.vel_x,
2602  patch->patchDataSOA.vel_y,
2603  patch->patchDataSOA.vel_z,
2604  patch->patchDataSOA.numAtoms,
2605 #endif
2606  maxForceUsed
2607  );
2608  TIMER_STOP(t, KICK);
2609 
2610  // XXX rattle2_SOA(timestep,step);
2611 
2612  TIMER_START(t, SUBMITFULL);
2613  submitReductions_SOA(
2614 #ifndef SOA_SIMPLIFY_PARAMS
2615  patch->patchDataSOA.hydrogenGroupSize,
2616  patch->patchDataSOA.mass,
2617  patch->patchDataSOA.pos_x,
2618  patch->patchDataSOA.pos_y,
2619  patch->patchDataSOA.pos_z,
2620  patch->patchDataSOA.vel_x,
2621  patch->patchDataSOA.vel_y,
2622  patch->patchDataSOA.vel_z,
2623  patch->patchDataSOA.f_normal_x,
2624  patch->patchDataSOA.f_normal_y,
2625  patch->patchDataSOA.f_normal_z,
2626  patch->patchDataSOA.f_nbond_x,
2627  patch->patchDataSOA.f_nbond_y,
2628  patch->patchDataSOA.f_nbond_z,
2629  patch->patchDataSOA.f_slow_x,
2630  patch->patchDataSOA.f_slow_y,
2631  patch->patchDataSOA.f_slow_z,
2632  patch->patchDataSOA.numAtoms
2633 #endif
2634  );
2635  TIMER_STOP(t, SUBMITFULL);
2636 #ifdef TESTPID
2637  if (1) {
2638  int pid = TESTPID;
2639  if (patch->patchID == pid) {
2640  const PatchDataSOA& p = patch->patchDataSOA;
2641  int n = p.numAtoms;
2642 #if 0
2643  fprintf(stderr, "Patch %d has %d atoms\n", pid, n);
2644  fprintf(stderr, "%3s %8s %12s %12s %12s\n",
2645  "", "id", "fnormal_x", "fnbond_x", "fslow_x");
2646  for (int i=0; i < n; i++) {
2647  int index = p.id[i];
2648  fprintf(stderr, "%3d %8d %12.8f %12.8f %12.8f\n",
2649  i, index, p.f_normal_x[i], p.f_nbond_x[i], p.f_slow_x[i]);
2650  }
2651 #else
2652  Vector *f_normal = new Vector[n];
2653  Vector *f_nbond = new Vector[n];
2654  Vector *f_slow = new Vector[n];
2655  for (int i=0; i < n; i++) {
2656  f_normal[i].x = p.f_normal_x[i];
2657  f_normal[i].y = p.f_normal_y[i];
2658  f_normal[i].z = p.f_normal_z[i];
2659  f_nbond[i].x = p.f_nbond_x[i];
2660  f_nbond[i].y = p.f_nbond_y[i];
2661  f_nbond[i].z = p.f_nbond_z[i];
2662  f_slow[i].x = p.f_slow_x[i];
2663  f_slow[i].y = p.f_slow_y[i];
2664  f_slow[i].z = p.f_slow_z[i];
2665  }
2666  TestArray_write<double>(
2667  "f_normal_good.bin", "f_normal good", (double*)f_normal, 3*n);
2668  TestArray_write<double>(
2669  "f_nbond_good.bin", "f_nbond good", (double*)f_nbond, 3*n);
2670  TestArray_write<double>(
2671  "f_slow_good.bin", "f_slow good", (double*)f_slow, 3*n);
2672  delete [] f_normal;
2673  delete [] f_nbond;
2674  delete [] f_slow;
2675 #endif
2676  }
2677  }
2678 #endif
2679 
2680  // Do collections if any of the output checks fired this step.
2681  // The checks are summed because we can't short-circuit.
2682  TIMER_START(t, SUBMITCOLLECT);
2683  if (isCollection) {
2684  submitCollections_SOA(step);
2685  }
2686  TIMER_STOP(t, SUBMITCOLLECT);
2687 
2688  NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_SOA_3); // integrate_SOA 3
2689 
2690  rebalanceLoad(step);
2691  }
2692 
2693  patch->copy_updates_to_AOS();
2694 
2695  TIMER_DONE(t);
2696  if (patch->patchID == SPECIAL_PATCH_ID) {
2697  printf("Timer collection reporting in microseconds for "
2698  "Patch %d\n", patch->patchID);
2699  TIMER_REPORT(t);
2700  }
2701 }
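 
// Schematic of one iteration of the integrate_SOA() loop above (velocity
// Verlet arrangement; Langevin, rattle, and reduction calls omitted):
//   kick  0.5: v += 0.5*dt*(f/m)   advance velocity to the half step
//   drift 1.0: x += dt*v           advance positions (split 0.5/0.5 around
//                                  the Langevin piston on piston steps)
//   force eval                     runComputeObjects_SOA()
//   kick  1.0 then kick -0.5       net +0.5*dt*(f/m), leaving on-step
//                                  velocities available for energy reporting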
2702 
2703 
2704 // XXX inline it?
2705 // XXX does not handle fixed atoms
2706 // Each timestep: dt = scaling * (timestep / TIMEFACTOR);
2707 void Sequencer::addForceToMomentum_SOA(
2708  const double scaling,
2709  double dt_normal, // timestep Results::normal = 0
2710  double dt_nbond, // timestep Results::nbond = 1
2711  double dt_slow, // timestep Results::slow = 2
2712 #ifndef SOA_SIMPLIFY_PARAMS
2713  const double * __restrict recipMass,
2714  const double * __restrict f_normal_x, // force Results::normal = 0
2715  const double * __restrict f_normal_y,
2716  const double * __restrict f_normal_z,
2717  const double * __restrict f_nbond_x, // force Results::nbond = 1
2718  const double * __restrict f_nbond_y,
2719  const double * __restrict f_nbond_z,
2720  const double * __restrict f_slow_x, // force Results::slow = 2
2721  const double * __restrict f_slow_y,
2722  const double * __restrict f_slow_z,
2723  double * __restrict vel_x,
2724  double * __restrict vel_y,
2725  double * __restrict vel_z,
2726  int numAtoms,
2727 #endif
2728  int maxForceNumber
2729  ) {
2730  NAMD_EVENT_RANGE_2(patch->flags.event_on,
2731  NamdProfileEvent::ADD_FORCE_TO_MOMENTUM_SOA);
2732 
2733 #ifdef SOA_SIMPLIFY_PARAMS
2734  const double * __restrict recipMass = patch->patchDataSOA.recipMass;
2735  // force Results::normal = 0
2736  const double * __restrict f_normal_x = patch->patchDataSOA.f_normal_x;
2737  const double * __restrict f_normal_y = patch->patchDataSOA.f_normal_y;
2738  const double * __restrict f_normal_z = patch->patchDataSOA.f_normal_z;
2739  // force Results::nbond = 1
2740  const double * __restrict f_nbond_x = patch->patchDataSOA.f_nbond_x;
2741  const double * __restrict f_nbond_y = patch->patchDataSOA.f_nbond_y;
2742  const double * __restrict f_nbond_z = patch->patchDataSOA.f_nbond_z;
2743  // force Results::slow = 2
2744  const double * __restrict f_slow_x = patch->patchDataSOA.f_slow_x;
2745  const double * __restrict f_slow_y = patch->patchDataSOA.f_slow_y;
2746  const double * __restrict f_slow_z = patch->patchDataSOA.f_slow_z;
2747  double * __restrict vel_x = patch->patchDataSOA.vel_x;
2748  double * __restrict vel_y = patch->patchDataSOA.vel_y;
2749  double * __restrict vel_z = patch->patchDataSOA.vel_z;
2750  int numAtoms = patch->patchDataSOA.numAtoms;
2751 #endif
2752  //
2753  // We could combine each case into a single loop with breaks,
2754  // with all faster forces also summed, like addForceToMomentum3().
2755  //
2756  // Things to consider:
2757  // - Do we always use acceleration (f/m) instead of just plain force?
2758  // Then we could instead buffer accel_slow, accel_nbond, etc.
2759  // - We will always need one multiply, since each dt includes
2760  // also a scaling factor.
2761  //
2762 
2763 #if 0
2764  if(this->patch->getPatchID() == 538){
2765  // fprintf(stderr, "Old Positions %lf %lf %lf\n", patch->patchDataSOA.pos_x[43], patch->patchDataSOA.pos_y[43], patch->patchDataSOA.pos_z[43]);
2766  // fprintf(stderr, "Old Velocities %lf %lf %lf\n", vel_x[43], vel_y[43], vel_z[ 43]);
2767  // fprintf(stderr, "Adding forces %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
2768  // f_slow_x[43], f_slow_y[43], f_slow_z[43],
2769  // f_nbond_x[43], f_nbond_y[43], f_nbond_z[43],
2770  // f_normal_x[43], f_normal_y[43], f_normal_z[43]);
2771  fprintf(stderr, "Old Positions %lf %lf %lf\n", patch->patchDataSOA.pos_x[0], patch->patchDataSOA.pos_y[0], patch->patchDataSOA.pos_z[0]);
2772  fprintf(stderr, "Old Velocities %lf %lf %lf\n", vel_x[0], vel_y[0], vel_z[ 0]);
2773  fprintf(stderr, "Adding forces %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
2774  f_slow_x[43], f_slow_y[43], f_slow_z[43],
2775  f_nbond_x[43], f_nbond_y[43], f_nbond_z[43],
2776  f_normal_x[43], f_normal_y[43], f_normal_z[43]);
2777  }
2778 #endif
2779  switch (maxForceNumber) {
2780  case Results::slow:
2781  dt_slow *= scaling;
2782  for (int i=0; i < numAtoms; i++) {
2783  vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
2784  vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
2785  vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
2786  }
2787  // fall through because we will always have the "faster" forces
2788  case Results::nbond:
2789  dt_nbond *= scaling;
2790  for (int i=0; i < numAtoms; i++) {
2791  vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
2792  vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
2793  vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
2794  }
2795  // fall through because we will always have the "faster" forces
2796  case Results::normal:
2797  dt_normal *= scaling;
2798  for (int i=0; i < numAtoms; i++) {
2799  vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
2800  vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
2801  vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
2802  }
2803  }
2804 }
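 
// The fall-through switch above accumulates every force level at or below
// maxForceNumber. An equivalent fused form (sketch only, x component shown;
// dt_* are the already-scaled per-level timesteps):
//   for (int i = 0; i < numAtoms; i++) {
//     double a = f_normal_x[i] * dt_normal;
//     if (maxForceNumber >= Results::nbond) a += f_nbond_x[i] * dt_nbond;
//     if (maxForceNumber >= Results::slow)  a += f_slow_x[i]  * dt_slow;
//     vel_x[i] += a * recipMass[i];
//   }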
2805 
2806 
2807 // XXX inline it?
2808 // XXX does not handle fixed atoms
2809 // Timestep: dt = scaling * (timestep / TIMEFACTOR);
2810 void Sequencer::addVelocityToPosition_SOA(
2811  const double dt
2812 #ifndef SOA_SIMPLIFY_PARAMS
2813  ,
2814  const double * __restrict vel_x,
2815  const double * __restrict vel_y,
2816  const double * __restrict vel_z,
2817  double * __restrict pos_x,
2818  double * __restrict pos_y,
2819  double * __restrict pos_z,
2820  int numAtoms
2821 #endif
2822  ) {
2823  NAMD_EVENT_RANGE_2(patch->flags.event_on,
2824  NamdProfileEvent::ADD_VELOCITY_TO_POSITION_SOA);
2825 #ifdef SOA_SIMPLIFY_PARAMS
2826  const double * __restrict vel_x = patch->patchDataSOA.vel_x;
2827  const double * __restrict vel_y = patch->patchDataSOA.vel_y;
2828  const double * __restrict vel_z = patch->patchDataSOA.vel_z;
2829  double * __restrict pos_x = patch->patchDataSOA.pos_x;
2830  double * __restrict pos_y = patch->patchDataSOA.pos_y;
2831  double * __restrict pos_z = patch->patchDataSOA.pos_z;
2832  int numAtoms = patch->patchDataSOA.numAtoms;
2833 #endif
2834  for (int i=0; i < numAtoms; i++) {
2835  pos_x[i] += vel_x[i] * dt;
2836  pos_y[i] += vel_y[i] * dt;
2837  pos_z[i] += vel_z[i] * dt;
2838  }
2839 #if 0
2840  if(this->patch->getPatchID() == 538){
2841  fprintf(stderr, "New Positions %lf %lf %lf\n", pos_x[43], pos_y[43], pos_z[43]);
2842  fprintf(stderr, "New Velocities %lf %lf %lf\n", vel_x[43], vel_y[43], vel_z[43]);
2843  }
2844 #endif
2845 
2846 }
2847 
2848 
2849 void Sequencer::submitHalfstep_SOA(
2850 #ifndef SOA_SIMPLIFY_PARAMS
2851  const int * __restrict hydrogenGroupSize,
2852  const float * __restrict mass,
2853  const double * __restrict vel_x,
2854  const double * __restrict vel_y,
2855  const double * __restrict vel_z,
2856  int numAtoms
2857 #endif
2858  ) {
2859  NAMD_EVENT_RANGE_2(patch->flags.event_on,
2860  NamdProfileEvent::SUBMIT_HALFSTEP_SOA);
2861 #ifdef SOA_SIMPLIFY_PARAMS
2862  const int * __restrict hydrogenGroupSize = patch->patchDataSOA.hydrogenGroupSize;
2863  const float * __restrict mass = patch->patchDataSOA.mass;
2864  const double * __restrict vel_x = patch->patchDataSOA.vel_x;
2865  const double * __restrict vel_y = patch->patchDataSOA.vel_y;
2866  const double * __restrict vel_z = patch->patchDataSOA.vel_z;
2867  int numAtoms = patch->patchDataSOA.numAtoms;
2868 #endif
2869  if ( 1 /* doKineticEnergy || patch->flags.doVirial */ ) {
2870  BigReal kineticEnergy = 0;
2871  Tensor virial;
2872  for (int i=0; i < numAtoms; i++) {
2873  // scalar kineticEnergy += mass[i] * vel[i]^2
2874  kineticEnergy += mass[i] *
2875  (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
2876  // tensor virial += mass[i] * outer_product(vel[i], vel[i])
2877  virial.xx += mass[i] * vel_x[i] * vel_x[i];
2878  virial.xy += mass[i] * vel_x[i] * vel_y[i];
2879  virial.xz += mass[i] * vel_x[i] * vel_z[i];
2880  virial.yx += mass[i] * vel_y[i] * vel_x[i];
2881  virial.yy += mass[i] * vel_y[i] * vel_y[i];
2882  virial.yz += mass[i] * vel_y[i] * vel_z[i];
2883  virial.zx += mass[i] * vel_z[i] * vel_x[i];
2884  virial.zy += mass[i] * vel_z[i] * vel_y[i];
2885  virial.zz += mass[i] * vel_z[i] * vel_z[i];
2886  }
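  // One factor of 0.5 below is the usual 1/2 m v^2; the second 0.5 is
  // assumed to average the two half-step submissions made each timestep.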
2887  kineticEnergy *= 0.5 * 0.5;
2888  virial *= 0.5;
2889 
2890 #ifdef NODEGROUP_FORCE_REGISTER
2891  if (simParams->CUDASOAintegrate) {
2892  CUDASequencer->patchData->reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += kineticEnergy;
2893  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_VIRIAL_NORMAL,virial);
2894  }
2895  else{
2896 #endif
2897  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += kineticEnergy;
2898  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,virial);
2899 #ifdef NODEGROUP_FORCE_REGISTER
2900  }
2901 #endif
2902  }
2903 
2904  if ( 1 /* doKineticEnergy || patch->flags.doVirial */ ) {
2905  BigReal intKineticEnergy = 0;
2906  Tensor intVirialNormal;
2907  int hgs;
2908  for (int i=0; i < numAtoms; i += hgs) {
2909  // find velocity of center-of-mass of hydrogen group
2910  // calculate mass-weighted velocity
2911  hgs = hydrogenGroupSize[i];
2912  BigReal m_cm = 0;
2913  BigReal v_cm_x = 0;
2914  BigReal v_cm_y = 0;
2915  BigReal v_cm_z = 0;
2916  for (int j = i; j < (i+hgs); j++) {
2917  m_cm += mass[j];
2918  v_cm_x += mass[j] * vel_x[j];
2919  v_cm_y += mass[j] * vel_y[j];
2920  v_cm_z += mass[j] * vel_z[j];
2921  }
2922  BigReal recip_m_cm = 1.0 / m_cm;
2923  v_cm_x *= recip_m_cm;
2924  v_cm_y *= recip_m_cm;
2925  v_cm_z *= recip_m_cm;
2926  // sum virial contributions wrt vel center-of-mass
2927  for (int j = i; j < (i+hgs); j++) {
2928  BigReal dv_x = vel_x[j] - v_cm_x;
2929  BigReal dv_y = vel_y[j] - v_cm_y;
2930  BigReal dv_z = vel_z[j] - v_cm_z;
2931  // scalar intKineticEnergy += mass[j] * dot_product(vel[j], dv)
2932  intKineticEnergy += mass[j] *
2933  (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
2934  // tensor intVirialNormal += mass[j] * outer_product(vel[j], dv)
2935  intVirialNormal.xx += mass[j] * vel_x[j] * dv_x;
2936  intVirialNormal.xy += mass[j] * vel_x[j] * dv_y;
2937  intVirialNormal.xz += mass[j] * vel_x[j] * dv_z;
2938  intVirialNormal.yx += mass[j] * vel_y[j] * dv_x;
2939  intVirialNormal.yy += mass[j] * vel_y[j] * dv_y;
2940  intVirialNormal.yz += mass[j] * vel_y[j] * dv_z;
2941  intVirialNormal.zx += mass[j] * vel_z[j] * dv_x;
2942  intVirialNormal.zy += mass[j] * vel_z[j] * dv_y;
2943  intVirialNormal.zz += mass[j] * vel_z[j] * dv_z;
2944  }
2945  }
2946  intKineticEnergy *= 0.5 * 0.5;
2947  intVirialNormal *= 0.5;
2948 #ifdef NODEGROUP_FORCE_REGISTER
2949  if (simParams->CUDASOAintegrate) {
2950  CUDASequencer->patchData->reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += intKineticEnergy;
2951  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_INT_VIRIAL_NORMAL, intVirialNormal);
2952  }
2953  else{
2954 #endif
2955  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2956  += intKineticEnergy;
2957  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2958  intVirialNormal);
2959 #ifdef NODEGROUP_FORCE_REGISTER
2960  }
2961 #endif
2962  }
2963 }
2964 
2965 
2966 //
2967 // XXX
2968 //
2969 void Sequencer::submitReductions_SOA(
2970 #ifndef SOA_SIMPLIFY_PARAMS
2971  const int * __restrict hydrogenGroupSize,
2972  const float * __restrict mass,
2973  const double * __restrict pos_x,
2974  const double * __restrict pos_y,
2975  const double * __restrict pos_z,
2976  const double * __restrict vel_x,
2977  const double * __restrict vel_y,
2978  const double * __restrict vel_z,
2979  const double * __restrict f_normal_x,
2980  const double * __restrict f_normal_y,
2981  const double * __restrict f_normal_z,
2982  const double * __restrict f_nbond_x,
2983  const double * __restrict f_nbond_y,
2984  const double * __restrict f_nbond_z,
2985  const double * __restrict f_slow_x,
2986  const double * __restrict f_slow_y,
2987  const double * __restrict f_slow_z,
2988  int numAtoms
2989 #endif
2990  ) {
2991  NAMD_EVENT_RANGE_2(patch->flags.event_on,
2992  NamdProfileEvent::SUBMIT_REDUCTIONS_SOA);
2993 #ifdef SOA_SIMPLIFY_PARAMS
2994  const int * __restrict hydrogenGroupSize = patch->patchDataSOA.hydrogenGroupSize;
2995  const float * __restrict mass = patch->patchDataSOA.mass;
2996  const double * __restrict pos_x = patch->patchDataSOA.pos_x;
2997  const double * __restrict pos_y = patch->patchDataSOA.pos_y;
2998  const double * __restrict pos_z = patch->patchDataSOA.pos_z;
2999  const double * __restrict vel_x = patch->patchDataSOA.vel_x;
3000  const double * __restrict vel_y = patch->patchDataSOA.vel_y;
3001  const double * __restrict vel_z = patch->patchDataSOA.vel_z;
3002  const double * __restrict f_normal_x = patch->patchDataSOA.f_normal_x;
3003  const double * __restrict f_normal_y = patch->patchDataSOA.f_normal_y;
3004  const double * __restrict f_normal_z = patch->patchDataSOA.f_normal_z;
3005  const double * __restrict f_nbond_x = patch->patchDataSOA.f_nbond_x;
3006  const double * __restrict f_nbond_y = patch->patchDataSOA.f_nbond_y;
3007  const double * __restrict f_nbond_z = patch->patchDataSOA.f_nbond_z;
3008  const double * __restrict f_slow_x = patch->patchDataSOA.f_slow_x;
3009  const double * __restrict f_slow_y = patch->patchDataSOA.f_slow_y;
3010  const double * __restrict f_slow_z = patch->patchDataSOA.f_slow_z;
3011  int numAtoms = patch->patchDataSOA.numAtoms;
3012 #endif
3013 
3014 #ifdef NODEGROUP_FORCE_REGISTER
3015  if (simParams->CUDASOAintegrate) {
3016  CUDASequencer->patchData->reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtoms;
3017  CUDASequencer->patchData->reduction->item(REDUCTION_MARGIN_VIOLATIONS) += patch->marginViolations;
3018  }else{
3019 #endif
3020  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtoms;
3021  reduction->item(REDUCTION_MARGIN_VIOLATIONS) += patch->marginViolations;
3022 #ifdef NODEGROUP_FORCE_REGISTER
3023  }
3024 #endif
3025 
3026  if ( 1 /* doKineticEnergy || doMomenta || patch->flags.doVirial */ ) {
3027  BigReal kineticEnergy = 0;
3028  BigReal momentum_x = 0;
3029  BigReal momentum_y = 0;
3030  BigReal momentum_z = 0;
3031  BigReal angularMomentum_x = 0;
3032  BigReal angularMomentum_y = 0;
3033  BigReal angularMomentum_z = 0;
3034  BigReal origin_x = patch->lattice.origin().x;
3035  BigReal origin_y = patch->lattice.origin().y;
3036  BigReal origin_z = patch->lattice.origin().z;
3037 
3038  // XXX pairInteraction
3039 
3040  for (int i=0; i < numAtoms; i++) {
3041 
3042  // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
3043  kineticEnergy += mass[i] *
3044  (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
3045 
3046  // vector momentum += mass[i] * vel[i]
3047  momentum_x += mass[i] * vel_x[i];
3048  momentum_y += mass[i] * vel_y[i];
3049  momentum_z += mass[i] * vel_z[i];
3050 
3051  // vector dpos = pos[i] - origin
3052  BigReal dpos_x = pos_x[i] - origin_x;
3053  BigReal dpos_y = pos_y[i] - origin_y;
3054  BigReal dpos_z = pos_z[i] - origin_z;
3055 
3056  // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
3057  angularMomentum_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
3058  angularMomentum_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
3059  angularMomentum_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
3060  }
3061 
3062  // XXX missing Drude
3063 
3064  kineticEnergy *= 0.5;
3065  Vector momentum(momentum_x, momentum_y, momentum_z);
3066  Vector angularMomentum(angularMomentum_x, angularMomentum_y,
3067  angularMomentum_z);
3068 
3069 #ifdef NODEGROUP_FORCE_REGISTER
3070  if (simParams->CUDASOAintegrate) {
3071  CUDASequencer->patchData->reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += kineticEnergy;
3072  ADD_VECTOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_MOMENTUM,momentum);
3073  ADD_VECTOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3074  }else{
3075 #endif
3076  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += kineticEnergy;
3077  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3078  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3079 #ifdef NODEGROUP_FORCE_REGISTER
3080  }
3081 #endif
3082  }
3083  // For non-Multigrator doKineticEnergy = 1 always
3084  if ( 1 /* doKineticEnergy || patch->flags.doVirial */ ) {
3085  BigReal intKineticEnergy = 0;
3086  Tensor intVirialNormal;
3087  Tensor intVirialNbond;
3088  Tensor intVirialSlow;
3089 
3090  int hgs = 1; // hydrogen group size
3091  for (int i=0; i < numAtoms; i += hgs) {
3092  hgs = hydrogenGroupSize[i];
3093  int j;
3094  BigReal m_cm = 0;
3095  BigReal r_cm_x = 0;
3096  BigReal r_cm_y = 0;
3097  BigReal r_cm_z = 0;
3098  BigReal v_cm_x = 0;
3099  BigReal v_cm_y = 0;
3100  BigReal v_cm_z = 0;
3101  for ( j = i; j < (i+hgs); ++j ) {
3102  m_cm += mass[j];
3103  r_cm_x += mass[j] * pos_x[j];
3104  r_cm_y += mass[j] * pos_y[j];
3105  r_cm_z += mass[j] * pos_z[j];
3106  v_cm_x += mass[j] * vel_x[j];
3107  v_cm_y += mass[j] * vel_y[j];
3108  v_cm_z += mass[j] * vel_z[j];
3109  }
3110  BigReal inv_m_cm = namd_reciprocal(m_cm);
3111  r_cm_x *= inv_m_cm;
3112  r_cm_y *= inv_m_cm;
3113  r_cm_z *= inv_m_cm;
3114  v_cm_x *= inv_m_cm;
3115  v_cm_y *= inv_m_cm;
3116  v_cm_z *= inv_m_cm;
3117 
3118  // XXX removed pairInteraction
3119  for ( j = i; j < (i+hgs); ++j ) {
3120  // XXX removed fixed atoms
3121 
3122  // vector vel[j] used twice below
3123  BigReal v_x = vel_x[j];
3124  BigReal v_y = vel_y[j];
3125  BigReal v_z = vel_z[j];
3126 
3127  // vector dv = vel[j] - v_cm
3128  BigReal dv_x = v_x - v_cm_x;
3129  BigReal dv_y = v_y - v_cm_y;
3130  BigReal dv_z = v_z - v_cm_z;
3131 
3132  // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
3133  intKineticEnergy += mass[j] *
3134  (v_x * dv_x + v_y * dv_y + v_z * dv_z);
3135 
3136  // vector dr = pos[j] - r_cm
3137  BigReal dr_x = pos_x[j] - r_cm_x;
3138  BigReal dr_y = pos_y[j] - r_cm_y;
3139  BigReal dr_z = pos_z[j] - r_cm_z;
3140 
3141  // tensor intVirialNormal += outer_product(f_normal[j], dr)
3142  intVirialNormal.xx += f_normal_x[j] * dr_x;
3143  intVirialNormal.xy += f_normal_x[j] * dr_y;
3144  intVirialNormal.xz += f_normal_x[j] * dr_z;
3145  intVirialNormal.yx += f_normal_y[j] * dr_x;
3146  intVirialNormal.yy += f_normal_y[j] * dr_y;
3147  intVirialNormal.yz += f_normal_y[j] * dr_z;
3148  intVirialNormal.zx += f_normal_z[j] * dr_x;
3149  intVirialNormal.zy += f_normal_z[j] * dr_y;
3150  intVirialNormal.zz += f_normal_z[j] * dr_z;
3151 
3152  // tensor intVirialNbond += outer_product(f_nbond[j], dr)
3153  intVirialNbond.xx += f_nbond_x[j] * dr_x;
3154  intVirialNbond.xy += f_nbond_x[j] * dr_y;
3155  intVirialNbond.xz += f_nbond_x[j] * dr_z;
3156  intVirialNbond.yx += f_nbond_y[j] * dr_x;
3157  intVirialNbond.yy += f_nbond_y[j] * dr_y;
3158  intVirialNbond.yz += f_nbond_y[j] * dr_z;
3159  intVirialNbond.zx += f_nbond_z[j] * dr_x;
3160  intVirialNbond.zy += f_nbond_z[j] * dr_y;
3161  intVirialNbond.zz += f_nbond_z[j] * dr_z;
3162 
3163  // tensor intVirialSlow += outer_product(f_slow[j], dr)
3164  intVirialSlow.xx += f_slow_x[j] * dr_x;
3165  intVirialSlow.xy += f_slow_x[j] * dr_y;
3166  intVirialSlow.xz += f_slow_x[j] * dr_z;
3167  intVirialSlow.yx += f_slow_y[j] * dr_x;
3168  intVirialSlow.yy += f_slow_y[j] * dr_y;
3169  intVirialSlow.yz += f_slow_y[j] * dr_z;
3170  intVirialSlow.zx += f_slow_z[j] * dr_x;
3171  intVirialSlow.zy += f_slow_z[j] * dr_y;
3172  intVirialSlow.zz += f_slow_z[j] * dr_z;
3173  }
3174  }
3175 
3176  intKineticEnergy *= 0.5;
3177 
3178 #ifdef NODEGROUP_FORCE_REGISTER
3179  if (simParams->CUDASOAintegrate) {
3180  // JM: Every PE will have its own copy of CUDASequencer, since it's a
3181  // group. However, they all share the same nodegroup pointer,
3182  // which means the reduction object is the same across all PEs
3183  CUDASequencer->patchData->reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += intKineticEnergy;
3184  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_INT_VIRIAL_NORMAL,intVirialNormal);
3185  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_INT_VIRIAL_NBOND,intVirialNbond);
3186  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_INT_VIRIAL_SLOW,intVirialSlow);
3187  }else{
3188 #endif
3189  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += intKineticEnergy;
3190  ADD_TENSOR_OBJECT(reduction,REDUCTION_INT_VIRIAL_NORMAL,intVirialNormal);
3191  ADD_TENSOR_OBJECT(reduction,REDUCTION_INT_VIRIAL_NBOND,intVirialNbond);
3192  ADD_TENSOR_OBJECT(reduction,REDUCTION_INT_VIRIAL_SLOW,intVirialSlow);
3193 #ifdef NODEGROUP_FORCE_REGISTER
3194  }
3195 #endif
3196  }
3197  // XXX removed pressure profile
3198 
3199  // XXX removed fixed atoms
3200 
3201  reduction->submit();
3202 
3203  // XXX removed pressure profile reduction
3204 }
3205 
3206 
3207 void Sequencer::submitCollections_SOA(int step, int zeroVel /* = 0 */)
3208 {
3209  //
3210  // Copy updates of SOA back into AOS for collections.
3211  //
3212  // XXX Could update positions and velocities separately.
3213  //
3214  NAMD_EVENT_RANGE_2(patch->flags.event_on,
3215  NamdProfileEvent::SUBMIT_COLLECTIONS_SOA);
3216  //
3217  // XXX Poor implementation here!
3218  // The selector functions called below in Output.C are
3219  // doing several tests and in an average use case calculating
3220  // at least two mod functions.
3221  //
3222  // However, most steps are NOT output steps!
3223  //
3224  int is_pos_needed;
3225  int dcdIndex;
3226  std::tie(is_pos_needed, dcdIndex)= Output::coordinateNeeded(step);
3227  int is_vel_needed = Output::velocityNeeded(step);
3228  int is_f_needed = Output::forceNeeded(step);
3229  if (!simParams->useDeviceMigration) { // This is already done for GPU migration
3230  if ( is_pos_needed || is_vel_needed ) {
3231  patch->copy_updates_to_AOS();
3232  }
3233  if (is_f_needed) {
3234  patch->copy_forces_to_AOS();
3235  }
3236  }
3237  if ( is_pos_needed ) {
3238 #ifndef MEM_OPT_VERSION
3239  collection->submitPositions(step,patch->atom,patch->lattice,is_pos_needed,dcdIndex);
3240 #else
3241  collection->submitPositions(step,patch->atom,patch->lattice,is_pos_needed);
3242 #endif
3243  }
3244  if ( is_vel_needed ) {
3245  collection->submitVelocities(step,zeroVel,patch->atom);
3246  }
3247  if ( is_f_needed ) {
3248  int maxForceUsed = patch->flags.maxForceUsed;
3249  if ( maxForceUsed > Results::slow ) maxForceUsed = Results::slow;
3250  collection->submitForces(step,patch->atom,maxForceUsed,patch->f);
3251  }
3252 }
3253 
3254 
3255 void Sequencer::maximumMove_SOA(
3256  const double dt,
3257  const double maxvel2
3258 #ifndef SOA_SIMPLIFY_PARAMS
3259  ,
3260  const double * __restrict vel_x,
3261  const double * __restrict vel_y,
3262  const double * __restrict vel_z,
3263  int numAtoms
3264 #endif
3265  ) {
3266  NAMD_EVENT_RANGE_2(patch->flags.event_on, NamdProfileEvent::MAXIMUM_MOVE_SOA);
3267 #ifdef SOA_SIMPLIFY_PARAMS
3268  const double * __restrict vel_x = patch->patchDataSOA.vel_x;
3269  const double * __restrict vel_y = patch->patchDataSOA.vel_y;
3270  const double * __restrict vel_z = patch->patchDataSOA.vel_z;
3271  int numAtoms = patch->patchDataSOA.numAtoms;
3272 #endif
3273 
3274  // XXX missing maximum move
3275 
3276  // Loop vectorizes when replacing logical OR with summing.
3277  int killme = 0;
3278  for (int i=0; i < numAtoms; i++) {
3279  BigReal vel2 =
3280  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3281  killme = killme + ( vel2 > maxvel2 );
3282  }
3283  if (killme) {
3284  // Found at least one atom that is moving too fast.
3285  // Terminating, so loop performance below doesn't matter.
3286  // Loop does not vectorize.
3287  killme = 0;
3288  for (int i=0; i < numAtoms; i++) {
3289  BigReal vel2 =
3290  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3291  if (vel2 > maxvel2) {
3292  const FullAtom *a = patch->atom.begin();
3293  const Vector vel(vel_x[i], vel_y[i], vel_z[i]);
3294  const BigReal maxvel = sqrt(maxvel2);
3295  ++killme;
3296  iout << iERROR << "Atom " << (a[i].id + 1) << " velocity is "
3297  << ( PDBVELFACTOR * vel ) << " (limit is "
3298  << ( PDBVELFACTOR * maxvel ) << ", atom "
3299  << i << " of " << numAtoms << " on patch "
3300  << patch->patchID << " pe " << CkMyPe() << ")\n" << endi;
3301  }
3302  }
3303  iout << iERROR <<
3304  "Atoms moving too fast; simulation has become unstable ("
3305  << killme << " atoms on patch " << patch->patchID
3306  << " pe " << CkMyPe() << ").\n" << endi;
3307  Node::Object()->enableEarlyExit();
3308  terminate();
3309  }
3310 }
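//
// The two-pass structure above keeps the common case fast: the first
// pass is a branchless count the compiler can vectorize, and the slow
// diagnostic pass runs only when that count is nonzero.  A minimal
// sketch of the vectorizable pattern:
//
static inline int countOverLimitSketch(const double *vx, const double *vy,
    const double *vz, int n, double maxvel2) {
  int count = 0;
  for (int i = 0; i < n; i++) {
    double v2 = vx[i]*vx[i] + vy[i]*vy[i] + vz[i]*vz[i];
    count += (v2 > maxvel2);  // integer sum instead of logical OR
  }
  return count;
}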
3311 
3312 
3313 void Sequencer::langevinVelocitiesBBK1_SOA(
3314  BigReal timestep
3315 #ifndef SOA_SIMPLIFY_PARAMS
3316  ,
3317  const float * __restrict langevinParam,
3318  double * __restrict vel_x,
3319  double * __restrict vel_y,
3320  double * __restrict vel_z,
3321  int numAtoms
3322 #endif
3323  ) {
3324  NAMD_EVENT_RANGE_2(patch->flags.event_on,
3325  NamdProfileEvent::LANGEVIN_VELOCITIES_BBK1_SOA);
3326 #ifdef SOA_SIMPLIFY_PARAMS
3327  const float * __restrict langevinParam = patch->patchDataSOA.langevinParam;
3328  double * __restrict vel_x = patch->patchDataSOA.vel_x;
3329  double * __restrict vel_y = patch->patchDataSOA.vel_y;
3330  double * __restrict vel_z = patch->patchDataSOA.vel_z;
3331  int numAtoms = patch->patchDataSOA.numAtoms;
3332 #endif
3333  if ( simParams->langevinOn /* && !simParams->langevin_useBAOAB */ )
3334  {
3335  // scale by TIMEFACTOR to convert to fs and then by 0.001 to ps
3336  // multiply by the Langevin damping coefficient, units 1/ps
3337  // XXX we could instead store time-scaled Langevin parameters
3338  BigReal dt = timestep * (0.001 * TIMEFACTOR);
3339 
3340  // XXX missing Drude
3341 
3342  //
3343  // The conditional inside the loop prevents vectorization and doesn't
3344  // avoid much work, since addition and multiplication are cheap.
3345  //
3346  for (int i=0; i < numAtoms; i++) {
3347  BigReal dt_gamma = dt * langevinParam[i];
3348  //if ( ! dt_gamma ) continue;
3349 
3350  BigReal scaling = 1. - 0.5 * dt_gamma;
3351  vel_x[i] *= scaling;
3352  vel_y[i] *= scaling;
3353  vel_z[i] *= scaling;
3354  }
3355  } // end if langevinOn
3356 }
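//
// Together with langevinVelocitiesBBK2_SOA() below, this routine forms
// the BBK (Brunger-Brooks-Karplus) discretization of Langevin dynamics.
// The loop above applies the friction part of the first half-kick,
//
//   v <- ( 1 - (dt*gamma)/2 ) * v
//
// where gamma = langevinParam[i] is in 1/ps and dt is in ps, so
// dt_gamma is dimensionless.
//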
3357 
3358 
3359 void Sequencer::langevinVelocitiesBBK2_SOA(
3360  BigReal timestep
3361 #ifndef SOA_SIMPLIFY_PARAMS
3362  ,
3363  const float * __restrict langevinParam,
3364  const float * __restrict langScalVelBBK2,
3365  const float * __restrict langScalRandBBK2,
3366  float * __restrict gaussrand_x,
3367  float * __restrict gaussrand_y,
3368  float * __restrict gaussrand_z,
3369  double * __restrict vel_x,
3370  double * __restrict vel_y,
3371  double * __restrict vel_z,
3372  int numAtoms
3373 #endif
3374  )
3375 {
3376  NAMD_EVENT_RANGE_2(patch->flags.event_on,
3377  NamdProfileEvent::LANGEVIN_VELOCITIES_BBK2_SOA);
3378 #ifdef SOA_SIMPLIFY_PARAMS
3379  const float * __restrict langevinParam = patch->patchDataSOA.langevinParam;
3380  const float * __restrict langScalVelBBK2 = patch->patchDataSOA.langScalVelBBK2;
3381  const float * __restrict langScalRandBBK2 = patch->patchDataSOA.langScalRandBBK2;
3382  float * __restrict gaussrand_x = patch->patchDataSOA.gaussrand_x;
3383  float * __restrict gaussrand_y = patch->patchDataSOA.gaussrand_y;
3384  float * __restrict gaussrand_z = patch->patchDataSOA.gaussrand_z;
3385  double * __restrict vel_x = patch->patchDataSOA.vel_x;
3386  double * __restrict vel_y = patch->patchDataSOA.vel_y;
3387  double * __restrict vel_z = patch->patchDataSOA.vel_z;
3388  int numAtoms = patch->patchDataSOA.numAtoms;
3389 #endif
3390  if ( simParams->langevinOn /* && !simParams->langevin_useBAOAB */ )
3391  {
3392  // XXX missing Drude
3393 
3394  // Scale by TIMEFACTOR to convert to fs and then by 0.001 to ps
3395  // multiply by the Langevin damping coefficient, units 1/ps.
3396  // XXX we could instead store time-scaled Langevin parameters
3397  BigReal dt = timestep * (0.001 * TIMEFACTOR);
3398  // Buffer the Gaussian random numbers
3399  if (simParams->langevinGammasDiffer) {
3400  // Must re-satisfy constraints if Langevin gammas differ.
3401  // (conserve momentum?)
3402  TIMER_START(patch->timerSet, RATTLE1);
3403  rattle1_SOA(timestep, 1);
3404  TIMER_STOP(patch->timerSet, RATTLE1);
3405  //
3406  // We don't need random numbers for atoms with gamma = 0.
3407  // If gammas differ, the likely case is that Langevin damping is
3408  // not applied to hydrogen, so those atoms have langevinParam = 0
3409  // and only about numAtoms/3 random vectors are needed.
3410  //
3411  // XXX can refine code below, count in advance how many
3412  // random numbers we need to use Random array filling routine
3413  //
3414  // XXX Loop does not vectorize!
3415  for (int i=0; i < numAtoms; i++) {
3416  Vector rg; // = 0
3417  if (langevinParam[i] != 0) rg = random->gaussian_vector();
3418  gaussrand_x[i] = float(rg.x);
3419  gaussrand_y[i] = float(rg.y);
3420  gaussrand_z[i] = float(rg.z);
3421  }
3422  }
3423  else {
3424  // Need to completely fill random number arrays.
3425  random->gaussian_array_f(gaussrand_x, numAtoms);
3426  random->gaussian_array_f(gaussrand_y, numAtoms);
3427  random->gaussian_array_f(gaussrand_z, numAtoms);
3428  }
3429 
3430  // do the velocity updates
3431  for (int i=0; i < numAtoms; i++) {
3432  vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
3433  vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
3434  vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
3435  vel_x[i] *= langScalVelBBK2[i];
3436  vel_y[i] *= langScalVelBBK2[i];
3437  vel_z[i] *= langScalVelBBK2[i];
3438  }
3439  } // end if langevinOn
3440 }
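//
// The final loop completes the BBK update,
//
//   v <- ( v + sigma * R ) / ( 1 + (dt*gamma)/2 )
//
// with R a unit-variance Gaussian vector.  The per-atom scale factors
// are presumably precomputed elsewhere as langScalRandBBK2[i] = sigma
// (by fluctuation-dissipation, sigma = sqrt(2*gamma*kB*T*dt/m) for an
// atom of mass m) and langScalVelBBK2[i] = 1/(1 + (dt*gamma)/2).
//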
3441 
3442 void Sequencer::berendsenPressure_SOA(
3443 #ifndef SOA_SIMPLIFY_PARAMS
3444  const int * __restrict hydrogenGroupSize,
3445  const float * __restrict mass,
3446  double * __restrict pos_x,
3447  double * __restrict pos_y,
3448  double * __restrict pos_z,
3449  int numAtoms,
3450 #endif
3451  int step)
3452 {
3453 #ifdef SOA_SIMPLIFY_PARAMS
3454  const int * __restrict hydrogenGroupSize = patch->patchDataSOA.hydrogenGroupSize;
3455  const float * __restrict mass = patch->patchDataSOA.mass;
3456  double * __restrict pos_x = patch->patchDataSOA.pos_x;
3457  double * __restrict pos_y = patch->patchDataSOA.pos_y;
3458  double * __restrict pos_z = patch->patchDataSOA.pos_z;
3459  int numAtoms = patch->patchDataSOA.numAtoms;
3460 #endif
3461 
3462  //
3463  // Loops below simplify if we lift out special cases of fixed atoms
3464  // and pressure excluded atoms and make them their own branch.
3465  //
3466 
3470  // Blocking receive for the updated lattice scaling factor.
3471  Tensor factor = broadcast->positionRescaleFactor.get(step);
3472  patch->lattice.rescale(factor);
3473  Vector origin = patch->lattice.origin();
3474 
3475  if ( simParams->useGroupPressure ) {
3476  int hgs;
3477  for (int i = 0; i < numAtoms; i += hgs) {
3478  int j;
3479  hgs = hydrogenGroupSize[i];
3480  // missing fixed atoms implementation
3481  BigReal m_cm = 0;
3482  BigReal r_cm_x = 0;
3483  BigReal r_cm_y = 0;
3484  BigReal r_cm_z = 0;
3485  // calculate the center of mass
3486  for ( j = i; j < (i+hgs); ++j ) {
3487  m_cm += mass[j];
3488  r_cm_x += mass[j] * pos_x[j];
3489  r_cm_y += mass[j] * pos_y[j];
3490  r_cm_z += mass[j] * pos_z[j];
3491  }
3492  BigReal inv_m_cm = namd_reciprocal(m_cm);
3493  r_cm_x *= inv_m_cm;
3494  r_cm_y *= inv_m_cm;
3495  r_cm_z *= inv_m_cm;
3496  // scale the center of mass with factor
3497  // shift to origin
3498  double tx = r_cm_x - origin.x;
3499  double ty = r_cm_y - origin.y;
3500  double tz = r_cm_z - origin.z;
3501  // apply transformation
3502  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3503  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3504  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3505  // shift back
3506  new_r_cm_x += origin.x;
3507  new_r_cm_y += origin.y;
3508  new_r_cm_z += origin.z;
3509  // translation vector from old COM and new COM
3510  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3511  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3512  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3513  // shift the hydrogen group with translation vector
3514  for (j = i; j < (i+hgs); ++j) {
3515  pos_x[j] += delta_r_cm_x;
3516  pos_y[j] += delta_r_cm_y;
3517  pos_z[j] += delta_r_cm_z;
3518  }
3519  }
3520  } else {
3521  for (int i = 0; i < numAtoms; ++i) {
3522  // missing fixed atoms implementation
3523  // scale the coordinates with factor
3524  // shift to origin
3525  double tx = pos_x[i] - origin.x;
3526  double ty = pos_y[i] - origin.y;
3527  double tz = pos_z[i] - origin.z;
3528  // apply transformation
3529  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3530  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3531  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3532  // shift back
3533  pos_x[i] = ftx + origin.x;
3534  pos_y[i] = fty + origin.y;
3535  pos_z[i] = ftz + origin.z;
3536  }
3537  }
3538  }
3539 }
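//
// With useGroupPressure the rescaling is applied per hydrogen group:
// the group's center of mass is scaled about the cell origin o,
//
//   r_cm' = factor * (r_cm - o) + o,
//
// and every atom in the group is translated by r_cm' - r_cm, which
// preserves intra-group geometry (e.g. rigid water).  Only without
// group pressure is each atom's position scaled individually.
//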
3540 
3542 #ifndef SOA_SIMPLIFY_PARAMS
3543  const int * __restrict hydrogenGroupSize,
3544  const float * __restrict mass,
3545  double * __restrict pos_x,
3546  double * __restrict pos_y,
3547  double * __restrict pos_z,
3548  double * __restrict vel_x,
3549  double * __restrict vel_y,
3550  double * __restrict vel_z,
3551  int numAtoms,
3552 #endif
3553  int step
3554  )
3555 {
3556 #ifdef SOA_SIMPLIFY_PARAMS
3557  const int * __restrict hydrogenGroupSize = patch->patchDataSOA.hydrogenGroupSize;
3558  const float * __restrict mass = patch->patchDataSOA.mass;
3559  double * __restrict pos_x = patch->patchDataSOA.pos_x;
3560  double * __restrict pos_y = patch->patchDataSOA.pos_y;
3561  double * __restrict pos_z = patch->patchDataSOA.pos_z;
3562  double * __restrict vel_x = patch->patchDataSOA.vel_x;
3563  double * __restrict vel_y = patch->patchDataSOA.vel_y;
3564  double * __restrict vel_z = patch->patchDataSOA.vel_z;
3565  int numAtoms = patch->patchDataSOA.numAtoms;
3566 #endif
3567 
3568  //
3569  // Loops below simplify if we lift out special cases of fixed atoms
3570  // and pressure excluded atoms and make them their own branch.
3571  //
3572 
3573  // Blocking receive for the updated lattice scaling factor.
3574 
3575  Tensor factor = broadcast->positionRescaleFactor.get(step);
3576 
3577  TIMER_START(patch->timerSet, PISTON);
3578  // JCP FIX THIS!!!
3579  double velFactor_x = namd_reciprocal(factor.xx);
3580  double velFactor_y = namd_reciprocal(factor.yy);
3581  double velFactor_z = namd_reciprocal(factor.zz);
3582  patch->lattice.rescale(factor);
3583  Vector origin = patch->lattice.origin();
3584  if ( simParams->useGroupPressure ) {
3585  int hgs;
3586  for (int i=0; i < numAtoms; i += hgs) {
3587  int j;
3588  hgs = hydrogenGroupSize[i];
3589  // missing fixed atoms
3590  BigReal m_cm = 0;
3591  BigReal r_cm_x = 0;
3592  BigReal r_cm_y = 0;
3593  BigReal r_cm_z = 0;
3594  BigReal v_cm_x = 0;
3595  BigReal v_cm_y = 0;
3596  BigReal v_cm_z = 0;
3597  for ( j = i; j < (i+hgs); ++j ) {
3598  m_cm += mass[j];
3599  r_cm_x += mass[j] * pos_x[j];
3600  r_cm_y += mass[j] * pos_y[j];
3601  r_cm_z += mass[j] * pos_z[j];
3602  v_cm_x += mass[j] * vel_x[j];
3603  v_cm_y += mass[j] * vel_y[j];
3604  v_cm_z += mass[j] * vel_z[j];
3605  }
3606  BigReal inv_m_cm = namd_reciprocal(m_cm);
3607  r_cm_x *= inv_m_cm;
3608  r_cm_y *= inv_m_cm;
3609  r_cm_z *= inv_m_cm;
3610 
3611  double tx = r_cm_x - origin.x;
3612  double ty = r_cm_y - origin.y;
3613  double tz = r_cm_z - origin.z;
3614  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3615  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3616  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3617  new_r_cm_x += origin.x;
3618  new_r_cm_y += origin.y;
3619  new_r_cm_z += origin.z;
3620 
3621  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3622  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3623  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3624  v_cm_x *= inv_m_cm;
3625  v_cm_y *= inv_m_cm;
3626  v_cm_z *= inv_m_cm;
3627  double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
3628  double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
3629  double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
3630  for (j = i; j < (i+hgs); j++) {
3631  pos_x[j] += delta_r_cm_x;
3632  pos_y[j] += delta_r_cm_y;
3633  pos_z[j] += delta_r_cm_z;
3634  vel_x[j] += delta_v_cm_x;
3635  vel_y[j] += delta_v_cm_y;
3636  vel_z[j] += delta_v_cm_z;
3637  }
3638  // if (i < 10)
3639  // printf("cpu: %d, %f, %f, %f, %f, %f, %f\n", i,
3640  // pos_x[i], pos_y[i], pos_z[i],
3641  // vel_x[i], vel_y[i], vel_z[i]);
3642  }
3643  }
3644  else {
3645  for (int i=0; i < numAtoms; i++) {
3646  double tx = pos_x[i] - origin.x;
3647  double ty = pos_y[i] - origin.y;
3648  double tz = pos_z[i] - origin.z;
3649  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3650  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3651  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3652  pos_x[i] = ftx + origin.x;
3653  pos_y[i] = fty + origin.y;
3654  pos_z[i] = ftz + origin.z;
3655  vel_x[i] *= velFactor_x;
3656  vel_y[i] *= velFactor_y;
3657  vel_z[i] *= velFactor_z;
3658  // if (i < 10)
3659  // printf("cpu: %d, %f, %f, %f, %f, %f, %f\n", i,
3660  // pos_x[i], pos_y[i], pos_z[i],
3661  // vel_x[i], vel_y[i], vel_z[i]);
3662  }
3663  }
3664  TIMER_STOP(patch->timerSet, PISTON);
3665  // exit(0);
3666 }
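//
// Note that velocities are scaled by the reciprocals of the diagonal
// entries of the position-rescaling tensor: in the scaled-coordinate
// formulation of the piston, momenta transform with the inverse of the
// cell scaling.  Off-diagonal terms are dropped from velFactor_*,
// which is presumably what the "JCP FIX THIS" marker above flags.
//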
3667 
3668 
3669 // timestep scaled by 1/TIMEFACTOR
3670 void Sequencer::rattle1_SOA(BigReal timestep, int pressure)
3671 {
3672  NAMD_EVENT_RANGE_2(patch->flags.event_on, NamdProfileEvent::RATTLE1_SOA);
3673  if ( simParams->rigidBonds != RIGID_NONE ) {
3674  Tensor virial;
3675  Tensor *vp = ( pressure ? &virial : 0 );
3676  // XXX pressureProfileReduction == NULL?
3677  if ( patch->rattle1_SOA(timestep, vp, pressureProfileReduction) ) {
3678  iout << iERROR <<
3679  "Constraint failure; simulation has become unstable.\n" << endi;
3680  Node::Object()->enableEarlyExit();
3681  terminate();
3682  }
3683 #ifdef NODEGROUP_FORCE_REGISTER
3685  ADD_TENSOR_OBJECT(CUDASequencer->patchData->reduction,REDUCTION_VIRIAL_NORMAL,virial);
3686  }
3687  else{
3688 #endif
3689  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,virial);
3690 #ifdef NODEGROUP_FORCE_REGISTER
3691  }
3692 #endif
3693  }
3694 }
3695 
3696 void Sequencer::runComputeObjects_SOA(int migration, int pairlists, int nstep)
3697 {
3698  if ( migration ) pairlistsAreValid = 0;
3699 #if (defined(NAMD_CUDA) || defined(NAMD_HIP)) || defined(NAMD_MIC)
3700  if ( pairlistsAreValid &&
3702  && ( pairlistsAge > pairlistsAgeLimit ) ) {
3703  pairlistsAreValid = 0;
3704  }
3705 #else
3706  if ( pairlistsAreValid && ( pairlistsAge > pairlistsAgeLimit ) ) {
3707  pairlistsAreValid = 0;
3708  }
3709 #endif
3710  if ( ! simParams->usePairlists ) pairlists = 0;
3711  patch->flags.usePairlists = pairlists || pairlistsAreValid;
3712  patch->flags.savePairlists = pairlists && ! pairlistsAreValid;
3713 
3714 #if defined(NTESTPID)
3715  if (1 && patch->patchID == NTESTPID) {
3716  int step = patch->flags.step;
3717  int numAtoms = patch->numAtoms;
3718  double *xyzq = new double[4*numAtoms];
3719  double *x = patch->patchDataSOA.pos_x;
3720  double *y = patch->patchDataSOA.pos_y;
3721  double *z = patch->patchDataSOA.pos_z;
3722  float *q = patch->patchDataSOA.charge;
3723  for (int i=0; i < numAtoms; i++) {
3724  xyzq[4*i ] = x[i];
3725  xyzq[4*i+1] = y[i];
3726  xyzq[4*i+2] = z[i];
3727  xyzq[4*i+3] = q[i];
3728  }
3729  char fname[128], remark[128];
3730  sprintf(fname, "xyzq_soa_pid%d_step%d.bin", NTESTPID, step);
3731  sprintf(remark, "SOA xyzq, patch %d, step %d", NTESTPID, step);
3732  TestArray_write<double>(fname, remark, xyzq, 4*numAtoms);
3733  delete[] xyzq;
3734  }
3735 #endif
3736  // Zero all SOA global forces before computing force
3737  patch->zero_global_forces_SOA();
3738  patch->positionsReady_SOA(migration); // updates flags.sequence
3739 
3740  int seq = patch->flags.sequence;
3741  int basePriority = ( (seq & 0xffff) << 15 )
3742  + PATCH_PRIORITY(patch->getPatchID());
3743 
3744  // XXX missing GBIS
3745  priority = basePriority + COMPUTE_HOME_PRIORITY;
3746  //char prbuf[32];
3747  //sprintf(prbuf, "%s: %d", NamdProfileEventStr[NamdProfileEvent::SEQ_SUSPEND], patch->getPatchID());
3748  //NAMD_EVENT_START_EX(1, NamdProfileEvent::SEQ_SUSPEND, prbuf);
3749  suspend(); // until all deposit boxes close
3750  //NAMD_EVENT_STOP(1, NamdProfileEvent::SEQ_SUSPEND);
3751 
3752 #ifdef NODEGROUP_FORCE_REGISTER
3753  if(!simParams->CUDASOAintegrate || migration){
3754  patch->copy_forces_to_SOA();
3755  }
3756 #else
3757  patch->copy_forces_to_SOA();
3758 #endif
3759 
3760 #if defined(NTESTPID)
3761  if (1 && patch->patchID == NTESTPID) {
3762  int step = patch->flags.step;
3763  int numAtoms = patch->numAtoms;
3764  char fname[128];
3765  char remark[128];
3766  double *fxyz = new double[3*numAtoms];
3767  double *fx = patch->patchDataSOA.f_normal_x;
3768  double *fy = patch->patchDataSOA.f_normal_y;
3769  double *fz = patch->patchDataSOA.f_normal_z;
3770  for (int i=0; i < numAtoms; i++) {
3771  fxyz[3*i ] = fx[i];
3772  fxyz[3*i+1] = fy[i];
3773  fxyz[3*i+2] = fz[i];
3774  }
3775  sprintf(fname, "fxyz_normal_soa_pid%d_step%d.bin", NTESTPID, step);
3776  sprintf(remark, "SOA fxyz normal, patch %d, step %d", NTESTPID, step);
3777  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3778  fx = patch->patchDataSOA.f_nbond_x;
3779  fy = patch->patchDataSOA.f_nbond_y;
3780  fz = patch->patchDataSOA.f_nbond_z;
3781  for (int i=0; i < numAtoms; i++) {
3782  fxyz[3*i ] = fx[i];
3783  fxyz[3*i+1] = fy[i];
3784  fxyz[3*i+2] = fz[i];
3785  }
3786  sprintf(fname, "fxyz_nbond_soa_pid%d_step%d.bin", NTESTPID, step);
3787  sprintf(remark, "SOA fxyz nonbonded, patch %d, step %d", NTESTPID, step);
3788  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3789  fx = patch->patchDataSOA.f_slow_x;
3790  fy = patch->patchDataSOA.f_slow_y;
3791  fz = patch->patchDataSOA.f_slow_z;
3792  for (int i=0; i < numAtoms; i++) {
3793  fxyz[3*i ] = fx[i];
3794  fxyz[3*i+1] = fy[i];
3795  fxyz[3*i+2] = fz[i];
3796  }
3797  sprintf(fname, "fxyz_slow_soa_pid%d_step%d.bin", NTESTPID, step);
3798  sprintf(remark, "SOA fxyz slow, patch %d, step %d", NTESTPID, step);
3799  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3800  delete[] fxyz;
3801  }
3802 #endif
3803 
3804 #if 0
3805  if (1 && patch->patchID == 0) {
3806  int numAtoms = patch->numAtoms;
3807  double *fxyz = new double[3*numAtoms];
3808  double *fx, *fy, *fz;
3809  char fname[64], remark[128];
3810  int step = patch->flags.step;
3811 
3812  fx = patch->patchDataSOA.f_slow_x;
3813  fy = patch->patchDataSOA.f_slow_y;
3814  fz = patch->patchDataSOA.f_slow_z;
3815  for (int i=0; i < numAtoms; i++) {
3816  fxyz[3*i ] = fx[i];
3817  fxyz[3*i+1] = fy[i];
3818  fxyz[3*i+2] = fz[i];
3819  }
3820  sprintf(fname, "fslow_soa_%d.bin", step);
3821  sprintf(remark, "SOA slow forces, step %d\n", step);
3822  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3823 
3824  fx = patch->patchDataSOA.f_nbond_x;
3825  fy = patch->patchDataSOA.f_nbond_y;
3826  fz = patch->patchDataSOA.f_nbond_z;
3827  for (int i=0; i < numAtoms; i++) {
3828  fxyz[3*i ] = fx[i];
3829  fxyz[3*i+1] = fy[i];
3830  fxyz[3*i+2] = fz[i];
3831  }
3832  sprintf(fname, "fnbond_soa_%d.bin", step);
3833  sprintf(remark, "SOA nonbonded forces, step %d\n", step);
3834  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3835 
3836  fx = patch->patchDataSOA.f_normal_x;
3837  fy = patch->patchDataSOA.f_normal_y;
3838  fz = patch->patchDataSOA.f_normal_z;
3839  for (int i=0; i < numAtoms; i++) {
3840  fxyz[3*i ] = fx[i];
3841  fxyz[3*i+1] = fy[i];
3842  fxyz[3*i+2] = fz[i];
3843  }
3844  sprintf(fname, "fnormal_soa_%d.bin", step);
3845  sprintf(remark, "SOA normal forces, step %d\n", step);
3846  TestArray_write<double>(fname, remark, fxyz, 3*numAtoms);
3847 
3848  delete[] fxyz;
3849  }
3850 #endif
3851 
3852 #if 0
3853  //Will print forces here after runComputeObjects
3854  if(nstep == 1){
3855  fprintf(stderr, "CPU force arrays for alanin\n" );
3856  for(int i = 0; i < patch->patchDataSOA.numAtoms; i++){
3857  fprintf(stderr, "f[%i] = %lf %lf %lf | %lf %lf %lf | %lf %lf %lf\n", i,
3858  patch->patchDataSOA.f_normal_x[i], patch->patchDataSOA.f_normal_y[i], patch->patchDataSOA.f_normal_z[i],
3859  patch->patchDataSOA.f_nbond_x[i], patch->patchDataSOA.f_nbond_y[i], patch->patchDataSOA.f_nbond_z[i],
3860  patch->patchDataSOA.f_slow_x[i], patch->patchDataSOA.f_slow_y[i], patch->patchDataSOA.f_slow_z[i]);
3861  }
3862  }
3863 #endif
3864 
3865  if ( patch->flags.savePairlists && patch->flags.doNonbonded ) {
3866  pairlistsAreValid = 1;
3867  pairlistsAge = 0;
3868  }
3869  // For multigrator, do not age pairlist during pressure step
3870  // NOTE: for non-multigrator pressureStep = 0 always
3871  if ( pairlistsAreValid /* && !pressureStep */ ) ++pairlistsAge;
3872 
3873  // XXX missing lonepairs
3874  // XXX missing Molly
3875  // XXX missing Lowe-Andersen
3876 }
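//
// Pairlist bookkeeping: flags.usePairlists allows lists to be used on
// this step, while flags.savePairlists additionally requests that they
// be regenerated (set when pairlists are wanted but not currently
// valid).  Lists age by one per step once valid and are invalidated by
// atom migration or by exceeding pairlistsAgeLimit.
//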
3877 
3882 void Sequencer::stochRescaleVelocities_SOA(int step)
3883 {
3884  if ( simParams->stochRescaleOn && ++stochRescale_count == simParams->stochRescaleFreq )
3885  {
3886  double * __restrict vel_x = patch->patchDataSOA.vel_x;
3887  double * __restrict vel_y = patch->patchDataSOA.vel_y;
3888  double * __restrict vel_z = patch->patchDataSOA.vel_z;
3889  int numAtoms = patch->patchDataSOA.numAtoms;
3890  // Blocking receive for the temperature coupling coefficient.
3891  BigReal velrescaling = broadcast->stochRescaleCoefficient.get(step);
3892  DebugM(4, "stochastically rescaling velocities at step " << step << " by " << velrescaling << "\n");
3893  for ( int i = 0; i < numAtoms; ++i ) {
3894  vel_x[i] *= velrescaling;
3895  vel_y[i] *= velrescaling;
3896  vel_z[i] *= velrescaling;
3897  }
3898  stochRescale_count = 0;
3899  }
3900 }
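//
// Stochastic velocity rescaling multiplies all velocities by a single
// coefficient that the Controller computes from the current and target
// kinetic energies (canonical sampling through velocity rescaling,
// Bussi et al.); the random component of that coefficient is what
// distinguishes it from plain weak-coupling rescaling.
//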
3901 
3902 //
3903 // end SOA code
3904 //
3906 
3907 #endif // SEQUENCER_SOA
3908 
3909 
3910 extern int eventEndOfTimeStep;
3911 
3912 void Sequencer::integrate(int scriptTask) {
3913  char traceNote[24];
3914  char tracePrefix[20];
3915  sprintf(tracePrefix, "p:%d,s:",patch->patchID);
3916 // patch->write_tip4_props();
3917 
3918  //
3919  // DJH: Copy all data into SOA (structure of arrays)
3920  // from AOS (array of structures) data structure.
3921  //
3922  //patch->copy_all_to_SOA();
3923 
3924 #ifdef TIMER_COLLECTION
3925  TimerSet& t = patch->timerSet;
3926 #endif
3927  TIMER_INIT_WIDTH(t, KICK, simParams->timerBinWidth);
3928  TIMER_INIT_WIDTH(t, MAXMOVE, simParams->timerBinWidth);
3929  TIMER_INIT_WIDTH(t, DRIFT, simParams->timerBinWidth);
3930  TIMER_INIT_WIDTH(t, PISTON, simParams->timerBinWidth);
3931  TIMER_INIT_WIDTH(t, SUBMITHALF, simParams->timerBinWidth);
3932  TIMER_INIT_WIDTH(t, VELBBK1, simParams->timerBinWidth);
3933  TIMER_INIT_WIDTH(t, VELBBK2, simParams->timerBinWidth);
3934  TIMER_INIT_WIDTH(t, RATTLE1, simParams->timerBinWidth);
3935  TIMER_INIT_WIDTH(t, SUBMITFULL, simParams->timerBinWidth);
3936  TIMER_INIT_WIDTH(t, SUBMITCOLLECT, simParams->timerBinWidth);
3937 
3938  int &step = patch->flags.step;
3939  step = simParams->firstTimestep;
3940 
3941  // drag switches
3942  const Bool rotDragOn = simParams->rotDragOn;
3943  const Bool movDragOn = simParams->movDragOn;
3944 
3945  const int commOnly = simParams->commOnly;
3946 
3947  int &maxForceUsed = patch->flags.maxForceUsed;
3948  int &maxForceMerged = patch->flags.maxForceMerged;
3949  maxForceUsed = Results::normal;
3950  maxForceMerged = Results::normal;
3951 
3952  const int numberOfSteps = simParams->N;
3953  const int stepsPerCycle = simParams->stepsPerCycle;
3954  const BigReal timestep = simParams->dt;
3955 
3956  // what MTS method?
3957  const int staleForces = ( simParams->MTSAlgorithm == NAIVE );
3958 
3959  const int nonbondedFrequency = simParams->nonbondedFrequency;
3960  slowFreq = nonbondedFrequency;
3961  const BigReal nbondstep = timestep * (staleForces?1:nonbondedFrequency);
3962  int &doNonbonded = patch->flags.doNonbonded;
3963  doNonbonded = (step >= numberOfSteps) || !(step%nonbondedFrequency);
3964  if ( nonbondedFrequency == 1 ) maxForceMerged = Results::nbond;
3965  if ( doNonbonded ) maxForceUsed = Results::nbond;
3966 
3967  // Do we do full electrostatics?
3968  const int dofull = ( simParams->fullElectFrequency ? 1 : 0 );
3969  const int fullElectFrequency = simParams->fullElectFrequency;
3970  if ( dofull ) slowFreq = fullElectFrequency;
3971  const BigReal slowstep = timestep * (staleForces?1:fullElectFrequency);
3972  int &doFullElectrostatics = patch->flags.doFullElectrostatics;
3973  doFullElectrostatics = (dofull && ((step >= numberOfSteps) || !(step%fullElectFrequency)));
3974  if ( dofull && (fullElectFrequency == 1) && !(simParams->mollyOn) )
3975  maxForceMerged = Results::slow;
3976  if ( doFullElectrostatics ) maxForceUsed = Results::slow;
3977 
3978 //#ifndef UPPER_BOUND
3979  const Bool accelMDOn = simParams->accelMDOn;
3980  const Bool accelMDdihe = simParams->accelMDdihe;
3981  const Bool accelMDdual = simParams->accelMDdual;
3982  if ( accelMDOn && (accelMDdihe || accelMDdual)) maxForceUsed = Results::amdf;
3983 
3984  // Is adaptive tempering on?
3985  const Bool adaptTempOn = simParams->adaptTempOn;
3986  adaptTempT = simParams->initialTemp;
3987  if (simParams->langevinOn)
3988  adaptTempT = simParams->langevinTemp;
3989  else if (simParams->rescaleFreq > 0)
3990  adaptTempT = simParams->rescaleTemp;
3991 
3992 
3993  int &doMolly = patch->flags.doMolly;
3994  doMolly = simParams->mollyOn && doFullElectrostatics;
3995  // BEGIN LA
3996  int &doLoweAndersen = patch->flags.doLoweAndersen;
3997  doLoweAndersen = simParams->loweAndersenOn && doNonbonded;
3998  // END LA
3999 
4000  int &doGBIS = patch->flags.doGBIS;
4001  doGBIS = simParams->GBISOn;
4002 
4003  int &doLCPO = patch->flags.doLCPO;
4004  doLCPO = simParams->LCPOOn;
4005 
4006  int zeroMomentum = simParams->zeroMomentum;
4007 
4008  // Do we need to return forces to TCL script or Colvar module?
4009  int doTcl = simParams->tclForcesOn;
4010  int doColvars = simParams->colvarsOn;
4011 //#endif
4012  int doGlobal = doTcl || doColvars;
4013  ComputeGlobal *computeGlobal = Node::Object()->computeMgr->computeGlobalObject;
4014 
4015  // Bother to calculate energies?
4016  int &doEnergy = patch->flags.doEnergy;
4017  int energyFrequency = simParams->computeEnergies;
4018 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4019  if(simParams->alchOn) energyFrequency = NAMD_gcd(energyFrequency, simParams->alchOutFreq);
4020 #endif
4021 #ifndef UPPER_BOUND
4022  const int reassignFreq = simParams->reassignFreq;
4023 #endif
4024 
4025  int &doVirial = patch->flags.doVirial;
4026  doVirial = 1;
4027 
4028  if ( scriptTask == SCRIPT_RUN ) {
4029 
4030 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4031 
4032 #ifndef UPPER_BOUND
4033 // printf("Doing initial rattle\n");
4034 #ifndef UPPER_BOUND
4035 D_MSG("rattle1()");
4036  TIMER_START(t, RATTLE1);
4037  rattle1(0.,0); // enforce rigid bond constraints on initial positions
4038  TIMER_STOP(t, RATTLE1);
4039 #endif
4040 
4041  if ( simParams->lonepairs ) {
4042  patch->atomMapper->registerIDsFullAtom(
4043  patch->atom.begin(),patch->atom.end());
4044  }
4045 
4046  if ( !commOnly && ( reassignFreq>0 ) && ! (step%reassignFreq) ) {
4047  reassignVelocities(timestep,step);
4048  }
4049 #endif
4050 
4051  doEnergy = ! ( step % energyFrequency );
4052 #ifndef UPPER_BOUND
4053  if ( accelMDOn && !accelMDdihe ) doEnergy=1;
4054  //Update energy every timestep for adaptive tempering
4055  if ( adaptTempOn ) doEnergy=1;
4056 #endif
4057 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4058 D_MSG("runComputeObjects()");
4059  runComputeObjects(1,step<numberOfSteps); // must migrate here!
4060 #ifndef UPPER_BOUND
4061  rescaleaccelMD(step, doNonbonded, doFullElectrostatics); // for accelMD
4062  adaptTempUpdate(step); // update adaptive tempering temperature
4063 #endif
4064 
4065 #ifndef UPPER_BOUND
4066  if ( staleForces || doGlobal ) {
4067  if ( doNonbonded ) saveForce(Results::nbond);
4068  if ( doFullElectrostatics ) saveForce(Results::slow);
4069  }
4070 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4071  if ( ! commOnly ) {
4072 D_MSG("newtonianVelocities()");
4073  TIMER_START(t, KICK);
4074  newtonianVelocities(-0.5,timestep,nbondstep,slowstep,0,1,1);
4075  TIMER_STOP(t, KICK);
4076  }
4077  minimizationQuenchVelocity();
4078 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4079 #ifndef UPPER_BOUND
4080 D_MSG("rattle1()");
4081  TIMER_START(t, RATTLE1);
4082  rattle1(-timestep,0);
4083  TIMER_STOP(t, RATTLE1);
4084 #endif
4085 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4086 D_MSG("submitHalfstep()");
4087  TIMER_START(t, SUBMITHALF);
4088  submitHalfstep(step);
4089  TIMER_STOP(t, SUBMITHALF);
4090 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4091  if ( ! commOnly ) {
4092 D_MSG("newtonianVelocities()");
4093  TIMER_START(t, KICK);
4094  newtonianVelocities(1.0,timestep,nbondstep,slowstep,0,1,1);
4095  TIMER_STOP(t, KICK);
4096  }
4097 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4098 D_MSG("rattle1()");
4099  TIMER_START(t, RATTLE1);
4100  rattle1(timestep,1);
4101  TIMER_STOP(t, RATTLE1);
4102  if (doGlobal) // include constraint forces
4103  computeGlobal->saveTotalForces(patch);
4104 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4105 D_MSG("submitHalfstep()");
4106  TIMER_START(t, SUBMITHALF);
4107  submitHalfstep(step);
4108  TIMER_STOP(t, SUBMITHALF);
4109  if ( zeroMomentum && doFullElectrostatics ) submitMomentum(step);
4110  if ( ! commOnly ) {
4111 D_MSG("newtonianVelocities()");
4112  TIMER_START(t, KICK);
4113  newtonianVelocities(-0.5,timestep,nbondstep,slowstep,0,1,1);
4114  TIMER_STOP(t, KICK);
4115  }
4116 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4117 #endif
4118 D_MSG("submitReductions()");
4119  TIMER_START(t, SUBMITFULL);
4120  submitReductions(step);
4121  TIMER_STOP(t, SUBMITFULL);
4122 // print_vel_AOS(patch->atom.begin(), 0, patch->numAtoms);
4123 #ifndef UPPER_BOUND
4124  if(0){ // if(traceIsOn()){
4125  traceUserEvent(eventEndOfTimeStep);
4126  sprintf(traceNote, "%s%d",tracePrefix,step);
4127  traceUserSuppliedNote(traceNote);
4128  }
4129 #endif
4130  rebalanceLoad(step);
4131 
4132  } // scriptTask == SCRIPT_RUN
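//
// Note on the kick sequence above: the three kicks sum to zero
// (-1/2 + 1 - 1/2), so aside from constraint corrections the
// velocities leave the SCRIPT_RUN block where they began.  The
// sequence exists to produce the half-step and centered kinetic
// averages for the first step, which later iterations of the main
// loop generate naturally while advancing.
//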
4133 
4134 #ifndef UPPER_BOUND
4135  bool doMultigratorRattle = false;
4136 #endif
4137 
4138  //
4139  // DJH: There are a lot of mod operations below and elsewhere to
4140  // test step number against the frequency of something happening.
4141  // Mod and integer division are expensive!
4142  // Might be better to replace with counters and test equality.
4143  //
4144 #if 0
4145  for(int i = 0; i < NamdProfileEvent::EventsCount; i++)
4146  CkPrintf("-------------- [%d] %s -------------\n", i, NamdProfileEventStr[i]);
4147 #endif
4148 
4149 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
4150  int& eon = patch->flags.event_on;
4151  int epid = (simParams->beginEventPatchID <= patch->getPatchID()
4152  && patch->getPatchID() <= simParams->endEventPatchID);
4153  int beginStep = simParams->beginEventStep;
4154  int endStep = simParams->endEventStep;
4155  bool controlProfiling = patch->getPatchID() == 0;
4156 #endif
4157 
4158  for ( ++step; step <= numberOfSteps; ++step )
4159  {
4160 #if defined(NAMD_NVTX_ENABLED) || defined(NAMD_CMK_TRACE_ENABLED) || defined(NAMD_ROCTX_ENABLED)
4161  eon = epid && (beginStep < step && step <= endStep);
4162 
4163  if (controlProfiling && step == beginStep) {
4164  NAMD_PROFILE_START();
4165  }
4166  if (controlProfiling && step == endStep) {
4167  NAMD_PROFILE_STOP();
4168  }
4169  char buf[32];
4170  sprintf(buf, "%s: %d", NamdProfileEventStr[NamdProfileEvent::INTEGRATE_1], patch->getPatchID());
4171  NAMD_EVENT_START_EX(eon, NamdProfileEvent::INTEGRATE_1, buf);
4172 #endif
4173  DebugM(3,"for step "<<step<< " dGlobal " << doGlobal<<"\n"<<endi);
4174 #ifndef UPPER_BOUND
4175  rescaleVelocities(step);
4176  tcoupleVelocities(timestep,step);
4177  if ( simParams->stochRescaleOn ) {
4178  stochRescaleVelocities(step);
4179  }
4180  berendsenPressure(step);
4181 
4182  if ( ! commOnly ) {
4183  TIMER_START(t, KICK);
4184  newtonianVelocities(0.5,timestep,nbondstep,slowstep,staleForces,doNonbonded,doFullElectrostatics);
4185  TIMER_STOP(t, KICK);
4186  }
4187 
4188  // We do RATTLE here if multigrator thermostat was applied in the previous step
4189  if (doMultigratorRattle) rattle1(timestep, 1);
4190 
4191  /* reassignment based on half-step velocities
4192  if ( !commOnly && ( reassignFreq>0 ) && ! (step%reassignFreq) ) {
4193  addVelocityToPosition(0.5*timestep);
4194  reassignVelocities(timestep,step);
4195  addVelocityToPosition(0.5*timestep);
4196  rattle1(0.,0);
4197  rattle1(-timestep,0);
4198  addVelocityToPosition(-1.0*timestep);
4199  rattle1(timestep,0);
4200  } */
4201 
4202  TIMER_START(t, MAXMOVE);
4203  maximumMove(timestep);
4204  TIMER_STOP(t, MAXMOVE);
4205 
4206  NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_1); // integrate 1
4207 
4207 
4208  if ( simParams->langevinPistonOn || (simParams->langevinOn && simParams->langevin_useBAOAB) ) {
4209  if ( ! commOnly ) {
4210  TIMER_START(t, DRIFT);
4211  addVelocityToPosition(0.5*timestep);
4212  TIMER_STOP(t, DRIFT);
4213  }
4214  // We add an Ornstein-Uhlenbeck integration step for the case of BAOAB (Langevin)
4215  langevinVelocities(timestep);
4216 
4217  // There is a blocking receive inside of langevinPiston()
4218  // that might suspend the current thread of execution,
4219  // so split profiling around this conditional block.
4220  langevinPiston(step);
4221 
4222  if ( ! commOnly ) {
4223  TIMER_START(t, DRIFT);
4224  addVelocityToPosition(0.5*timestep);
4225  TIMER_STOP(t, DRIFT);
4226  }
4227  } else {
4228  // If Langevin is not used, take the full time step directly instead of two half steps
4229  if ( ! commOnly ) {
4230  TIMER_START(t, DRIFT);
4231  addVelocityToPosition(timestep);
4232  TIMER_STOP(t, DRIFT);
4233  }
4234  }
4235 
4236  NAMD_EVENT_START(eon, NamdProfileEvent::INTEGRATE_2);
4237 
4238  // impose hard wall potential for Drude bond length
4239  hardWallDrude(timestep, 1);
4240 
4242 #endif // UPPER_BOUND
4243 
4244  doNonbonded = !(step%nonbondedFrequency);
4245  doFullElectrostatics = (dofull && !(step%fullElectFrequency));
4246 
4247 #ifndef UPPER_BOUND
4248  if ( zeroMomentum && doFullElectrostatics ) {
4249  // There is a blocking receive inside of correctMomentum().
4250  correctMomentum(step,slowstep);
4251  }
4252 
4253  // There are NO sends in submitHalfstep() just local summation
4254  // into the Reduction struct.
4255  TIMER_START(t, SUBMITHALF);
4256  submitHalfstep(step);
4257  TIMER_STOP(t, SUBMITHALF);
4258 
4259  doMolly = simParams->mollyOn && doFullElectrostatics;
4260  // BEGIN LA
4261  doLoweAndersen = simParams->loweAndersenOn && doNonbonded;
4262  // END LA
4263 
4264  maxForceUsed = Results::normal;
4265  if ( doNonbonded ) maxForceUsed = Results::nbond;
4266  if ( doFullElectrostatics ) maxForceUsed = Results::slow;
4267  if ( accelMDOn && (accelMDdihe || accelMDdual)) maxForceUsed = Results::amdf;
4268 
4269  // Migrate Atoms on stepsPerCycle
4270  doEnergy = ! ( step % energyFrequency );
4271  if ( accelMDOn && !accelMDdihe ) doEnergy=1;
4272  if ( adaptTempOn ) doEnergy=1;
4273 
4274  // Multigrator
4275  if (simParams->multigratorOn) {
4276  doVirial = (!(step % energyFrequency) || ((simParams->outputPressure > 0) && !(step % simParams->outputPressure))
4277  || !(step % simParams->multigratorPressureFreq));
4278  doKineticEnergy = (!(step % energyFrequency) || !(step % simParams->multigratorTemperatureFreq));
4279  doMomenta = (simParams->outputMomenta > 0) && !(step % simParams->outputMomenta);
4280  } else {
4281  doVirial = 1;
4282  doKineticEnergy = 1;
4283  doMomenta = 1;
4284  }
4285 #endif
4286  NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_2); // integrate 2
4287 
4288  // The current thread of execution will suspend in runComputeObjects().
4289  runComputeObjects(!(step%stepsPerCycle),step<numberOfSteps);
4290 
4291  NAMD_EVENT_START(eon, NamdProfileEvent::INTEGRATE_3);
4292 
4293 #ifndef UPPER_BOUND
4294  rescaleaccelMD(step, doNonbonded, doFullElectrostatics); // for accelMD
4295 
4296  if ( staleForces || doGlobal ) {
4297  if ( doNonbonded ) saveForce(Results::nbond);
4298  if ( doFullElectrostatics ) saveForce(Results::slow);
4299  }
4300 
4301  // reassignment based on full-step velocities
4302  if ( !commOnly && ( reassignFreq>0 ) && ! (step%reassignFreq) ) {
4303  reassignVelocities(timestep,step);
4304  newtonianVelocities(-0.5,timestep,nbondstep,slowstep,staleForces,doNonbonded,doFullElectrostatics);
4305  rattle1(-timestep,0);
4306  }
4307 
4308  if ( ! commOnly ) {
4309  TIMER_START(t, VELBBK1);
4310  langevinVelocitiesBBK1(timestep);
4311  TIMER_STOP(t, VELBBK1);
4312  TIMER_START(t, KICK);
4313  newtonianVelocities(1.0,timestep,nbondstep,slowstep,staleForces,doNonbonded,doFullElectrostatics);
4314  TIMER_STOP(t, KICK);
4315  TIMER_START(t, VELBBK2);
4316  langevinVelocitiesBBK2(timestep);
4317  TIMER_STOP(t, VELBBK2);
4318  }
4319 
4320  // add drag to each atom's positions
4321  if ( ! commOnly && movDragOn ) addMovDragToPosition(timestep);
4322  if ( ! commOnly && rotDragOn ) addRotDragToPosition(timestep);
4323 
4324  TIMER_START(t, RATTLE1);
4325  rattle1(timestep,1);
4326  TIMER_STOP(t, RATTLE1);
4327  if (doGlobal) // include constraint forces
4328  computeGlobal->saveTotalForces(patch);
4329 
4330  TIMER_START(t, SUBMITHALF);
4331  submitHalfstep(step);
4332  TIMER_STOP(t, SUBMITHALF);
4333  if ( zeroMomentum && doFullElectrostatics ) submitMomentum(step);
4334 
4335  if ( ! commOnly ) {
4336  TIMER_START(t, KICK);
4337  newtonianVelocities(-0.5,timestep,nbondstep,slowstep,staleForces,doNonbonded,doFullElectrostatics);
4338  TIMER_STOP(t, KICK);
4339  }
4340 
4341  // rattle2(timestep,step);
4342 #endif
4343 
4344  TIMER_START(t, SUBMITFULL);
4345  submitReductions(step);
4346  TIMER_STOP(t, SUBMITFULL);
4347  TIMER_START(t, SUBMITCOLLECT);
4348  submitCollections(step);
4349  TIMER_STOP(t, SUBMITCOLLECT);
4350 #ifndef UPPER_BOUND
4351  //Update adaptive tempering temperature
4352  adaptTempUpdate(step);
4353 
4354  // Multigrator temperature and pressure steps
4355  multigratorTemperature(step, 1);
4356  multigratorPressure(step, 1);
4357  multigratorPressure(step, 2);
4358  multigratorTemperature(step, 2);
4359  doMultigratorRattle = (simParams->multigratorOn && !(step % simParams->multigratorTemperatureFreq));
4360 
4361  NAMD_EVENT_STOP(eon, NamdProfileEvent::INTEGRATE_3); // integrate 3
4362 #endif
4363 
4364 #if CYCLE_BARRIER
4365  cycleBarrier(!((step+1) % stepsPerCycle), step);
4366 #elif PME_BARRIER
4367  cycleBarrier(doFullElectrostatics, step);
4368 #elif STEP_BARRIER
4369  cycleBarrier(1, step);
4370 #endif
4371 
4372 #ifndef UPPER_BOUND
4373  if(Node::Object()->specialTracing || simParams->statsOn){
4374  int bstep = simParams->traceStartStep;
4375  int estep = bstep + simParams->numTraceSteps;
4376  if(step == bstep || step == estep){
4377  traceBarrier(step);
4378  }
4379  }
4380 
4381 #ifdef MEASURE_NAMD_WITH_PAPI
4382  if(simParams->papiMeasure) {
4383  int bstep = simParams->papiMeasureStartStep;
4384  int estep = bstep + simParams->numPapiMeasureSteps;
4385  if(step == bstep || step==estep) {
4386  papiMeasureBarrier(step);
4387  }
4388  }
4389 #endif
4390 
4391  if(0){ // if(traceIsOn()){
4392  traceUserEvent(eventEndOfTimeStep);
4393  sprintf(traceNote, "%s%d",tracePrefix,step);
4394  traceUserSuppliedNote(traceNote);
4395  }
4396 #endif // UPPER_BOUND
4397  rebalanceLoad(step);
4398 
4399 #if PME_BARRIER
4400  // a step before PME
4401  cycleBarrier(dofull && !((step+1)%fullElectFrequency),step);
4402 #endif
4403 
4404 #if USE_HPM
4405  if(step == START_HPM_STEP)
4406  (CProxy_Node(CkpvAccess(BOCclass_group).node)).startHPM();
4407 
4408  if(step == STOP_HPM_STEP)
4409  (CProxy_Node(CkpvAccess(BOCclass_group).node)).stopHPM();
4410 #endif
4411 
4412  }
4413 
4414  TIMER_DONE(t);
4415 #ifdef TIMER_COLLECTION
4416  if (patch->patchID == SPECIAL_PATCH_ID) {
4417  printf("Timer collection reporting in microseconds for "
4418  "Patch %d\n", patch->patchID);
4419  TIMER_REPORT(t);
4420  }
4421 #endif // TIMER_COLLECTION
4422  //
4423  // DJH: Copy updates of SOA back into AOS.
4424  //
4425  //patch->copy_updates_to_AOS();
4426 }
4427 
4428 // add moving drag to each atom's position
4429 void Sequencer::addMovDragToPosition(BigReal timestep) {
4430  FullAtom *atom = patch->atom.begin();
4431  int numAtoms = patch->numAtoms;
4432  Molecule *molecule = Node::Object()->molecule; // need its methods
4433  const BigReal movDragGlobVel = simParams->movDragGlobVel;
4434  const BigReal dt = timestep / TIMEFACTOR; // MUST be as in the integrator!
4435  Vector movDragVel, dragIncrement;
4436  for ( int i = 0; i < numAtoms; ++i )
4437  {
4438  // skip if fixed atom or zero drag attribute
4439  if ( (simParams->fixedAtomsOn && atom[i].atomFixed)
4440  || !(molecule->is_atom_movdragged(atom[i].id)) ) continue;
4441  molecule->get_movdrag_params(movDragVel, atom[i].id);
4442  dragIncrement = movDragGlobVel * movDragVel * dt;
4443  atom[i].position += dragIncrement;
4444  }
4445 }
4446 
4447 // add rotating drag to each atom's position
4448 void Sequencer::addRotDragToPosition(BigReal timestep) {
4449  FullAtom *atom = patch->atom.begin();
4450  int numAtoms = patch->numAtoms;
4451  Molecule *molecule = Node::Object()->molecule; // need its methods
4452  const BigReal rotDragGlobVel = simParams->rotDragGlobVel;
4453  const BigReal dt = timestep / TIMEFACTOR; // MUST be as in the integrator!
4454  BigReal rotDragVel, dAngle;
4455  Vector atomRadius;
4456  Vector rotDragAxis, rotDragPivot, dragIncrement;
4457  for ( int i = 0; i < numAtoms; ++i )
4458  {
4459  // skip if fixed atom or zero drag attribute
4460  if ( (simParams->fixedAtomsOn && atom[i].atomFixed)
4461  || !(molecule->is_atom_rotdragged(atom[i].id)) ) continue;
4462  molecule->get_rotdrag_params(rotDragVel, rotDragAxis, rotDragPivot, atom[i].id);
4463  dAngle = rotDragGlobVel * rotDragVel * dt;
4464  rotDragAxis /= rotDragAxis.length();
4465  atomRadius = atom[i].position - rotDragPivot;
4466  dragIncrement = cross(rotDragAxis, atomRadius) * dAngle;
4467  atom[i].position += dragIncrement;
4468  }
4469 }
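//
// The increment applied above is a linearized rotation about the axis
// through rotDragPivot,
//
//   dr = dAngle * ( a_hat x (r - p) ),
//
// with a_hat the normalized rotDragAxis and p the pivot; this is valid
// for the small per-step angle dAngle.  The axis is normalized here
// rather than assuming the per-atom parameters are unit vectors.
//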
4470 
4471 void Sequencer::minimize() {
4472  //
4473  // DJH: Copy all data into SOA (structure of arrays)
4474  // from AOS (array of structures) data structure.
4475  //
4476  //patch->copy_all_to_SOA();
4477 
4478  const int numberOfSteps = simParams->N;
4479  const int stepsPerCycle = simParams->stepsPerCycle;
4480 #if 0 && defined(NODEGROUP_FORCE_REGISTER)
4481  // XXX DJH: This is a hack found to get the GPU nonbonded
4482  // force calculation right for --with-single-node-cuda builds
4483  const int stepsPerCycle_save = stepsPerCycle;
4484  simParams->stepsPerCycle = 1;
4485 #endif
4486  int &step = patch->flags.step;
4487  step = simParams->firstTimestep;
4488 
4489  int &maxForceUsed = patch->flags.maxForceUsed;
4490  int &maxForceMerged = patch->flags.maxForceMerged;
4491  maxForceUsed = Results::normal;
4492  maxForceMerged = Results::normal;
4493  int &doNonbonded = patch->flags.doNonbonded;
4494  doNonbonded = 1;
4495  maxForceUsed = Results::nbond;
4496  maxForceMerged = Results::nbond;
4497  const int dofull = ( simParams->fullElectFrequency ? 1 : 0 );
4498  int &doFullElectrostatics = patch->flags.doFullElectrostatics;
4499  doFullElectrostatics = dofull;
4500  if ( dofull ) {
4501  maxForceMerged = Results::slow;
4502  maxForceUsed = Results::slow;
4503  }
4504  int &doMolly = patch->flags.doMolly;
4505  doMolly = simParams->mollyOn && doFullElectrostatics;
4506  int &doMinimize = patch->flags.doMinimize;
4507  doMinimize = 1;
4508  // BEGIN LA
4509  int &doLoweAndersen = patch->flags.doLoweAndersen;
4510  doLoweAndersen = 0;
4511  // END LA
4512 
4513  int &doGBIS = patch->flags.doGBIS;
4514  doGBIS = simParams->GBISOn;
4515 
4516  int &doLCPO = patch->flags.doLCPO;
4517  doLCPO = simParams->LCPOOn;
4518 
4519  int doTcl = simParams->tclForcesOn;
4520  int doColvars = simParams->colvarsOn;
4521  int doGlobal = doTcl || doColvars;
4522  ComputeGlobal *computeGlobal = Node::Object()->computeMgr->computeGlobalObject;
4523 
4524  int &doEnergy = patch->flags.doEnergy;
4525  doEnergy = 1;
4526 
4527  // Do this to stabilize the minimizer, whether or not the user
4528  // wants rigid bond constraints enabled for dynamics.
4529  // To enforce them here, we have to call HomePatch::rattle1() directly.
4530  patch->rattle1(0.,0,0); // enforce rigid bond constraints on initial positions
4531 
4532  if ( simParams->lonepairs ) {
4533  patch->atomMapper->registerIDsFullAtom(
4534  patch->atom.begin(),patch->atom.end());
4535  }
4536 
4537  runComputeObjects(1,step<numberOfSteps); // must migrate here!
4538 
4539  if ( doGlobal ) {
4540 #ifdef DEBUG_MINIMIZE
4541  printf("doTcl = %d doColvars = %d\n", doTcl, doColvars);
4542 #endif
4543  if ( doNonbonded ) saveForce(Results::nbond);
4544  if ( doFullElectrostatics ) saveForce(Results::slow);
4545  computeGlobal->saveTotalForces(patch);
4546  }
4547 #ifdef DEBUG_MINIMIZE
4548  else { printf("No computeGlobal\n"); }
4549 #endif
4550 
4551  BigReal fmax2 = TIMEFACTOR * TIMEFACTOR * TIMEFACTOR * TIMEFACTOR;
4552 
4553  submitMinimizeReductions(step,fmax2);
4554  rebalanceLoad(step);
4555 
4556  int downhill = 1; // start out just fixing bad contacts
4557  int minSeq = 0;
4558  for ( ++step; step <= numberOfSteps; ++step ) {
4559  // Blocking receive for the minimization coefficient.
4560  BigReal c = broadcast->minimizeCoefficient.get(minSeq++);
4561 
4562  if ( downhill ) {
4563  if ( c ) minimizeMoveDownhill(fmax2);
4564  else {
4565  downhill = 0;
4566  fmax2 *= 10000.;
4567  }
4568  }
4569  if ( ! downhill ) {
4570  if ( ! c ) { // new direction
4571 
4572  // Blocking receive for the minimization coefficient.
4573  c = broadcast->minimizeCoefficient.get(minSeq++);
4574 
4575  newMinimizeDirection(c); // v = c * v + f
4576 
4577  // Blocking receive for the minimization coefficient.
4578  c = broadcast->minimizeCoefficient.get(minSeq++);
4579 
4580  } // same direction
4581  newMinimizePosition(c); // x = x + c * v
4582  }
4583 
4584  runComputeObjects(!(step%stepsPerCycle),step<numberOfSteps);
4585  if ( doGlobal ) {
4586  if ( doNonbonded ) saveForce(Results::nbond);
4587  if ( doFullElectrostatics ) saveForce(Results::slow);
4588  computeGlobal->saveTotalForces(patch);
4589  }
4590  submitMinimizeReductions(step,fmax2);
4591  submitCollections(step, 1); // write out zeros for velocities
4592  rebalanceLoad(step);
4593  }
4594  quenchVelocities(); // zero out bogus velocity
4595 
4596  doMinimize = 0;
4597 
4598 #if 0
4599  // when using CUDASOAintegrate, need to update SOA data structures
4600  if (simParams->CUDASOAintegrate) {
4601  patch->copy_atoms_to_SOA();
4602  }
4603 #endif
4604 
4605 #if 0 && defined(NODEGROUP_FORCE_REGISTER)
4606  // XXX DJH: all patches in a PE are writing into simParams
4607  // so this hack needs a guard
4608  simParams->stepsPerCycle = stepsPerCycle_save;
4609 #endif
4610  //
4611  // DJH: Copy updates of SOA back into AOS.
4612  //
4613  //patch->copy_updates_to_AOS();
4614 }
4615 
4616 // x = x + 0.1 * unit(f) for large f
4617 void Sequencer::minimizeMoveDownhill(BigReal fmax2) {
4618 
4619  FullAtom *a = patch->atom.begin();
4620  Force *f1 = patch->f[Results::normal].begin(); // includes nbond and slow
4621  int numAtoms = patch->numAtoms;
4622 
4623  for ( int i = 0; i < numAtoms; ++i ) {
4624  if ( simParams->fixedAtomsOn && a[i].atomFixed ) continue;
4625  Force f = f1[i];
4626  if ( f.length2() > fmax2 ) {
4627  a[i].position += ( 0.1 * f.unit() );
4628  int hgs = a[i].hydrogenGroupSize; // 0 if not parent
4629  for ( int j=1; j<hgs; ++j ) {
4630  a[++i].position += ( 0.1 * f.unit() );
4631  }
4632  }
4633  }
4634 
4635  patch->rattle1(0.,0,0);
4636 }
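//
// Stepping by 0.1 * unit(f) caps the displacement at 0.1 A per call no
// matter how large the contact force is, relaxing bad contacts without
// overshooting.  The hydrogen group follows its parent atom so bonded
// geometry is preserved, and the rattle1() call restores any rigid
// bond constraints afterwards.
//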
4637 
4638 // v = c * v + f
4639 void Sequencer::newMinimizeDirection(BigReal c) {
4640  FullAtom *a = patch->atom.begin();
4641  Force *f1 = patch->f[Results::normal].begin(); // includes nbond and slow
4642  const bool fixedAtomsOn = simParams->fixedAtomsOn;
4643  const bool drudeHardWallOn = simParams->drudeHardWallOn;
4644  int numAtoms = patch->numAtoms;
4645  BigReal maxv2 = 0.;
4646 
4647  for ( int i = 0; i < numAtoms; ++i ) {
4648  a[i].velocity *= c;
4649  a[i].velocity += f1[i];
4650  if ( drudeHardWallOn && i && (0.05 < a[i].mass) && ((a[i].mass < 1.0)) ) { // drude particle
4651  a[i].velocity = a[i-1].velocity;
4652  }
4653  if ( fixedAtomsOn && a[i].atomFixed ) a[i].velocity = 0;
4654  BigReal v2 = a[i].velocity.length2();
4655  if ( v2 > maxv2 ) maxv2 = v2;
4656  }
4657 
4658  { Tensor virial; patch->minimize_rattle2( 0.1 * TIMEFACTOR / sqrt(maxv2), &virial); }
4659 
4660  maxv2 = 0.;
4661  for ( int i = 0; i < numAtoms; ++i ) {
4662  if ( drudeHardWallOn && i && (0.05 < a[i].mass) && ((a[i].mass < 1.0)) ) { // drude particle
4663  a[i].velocity = a[i-1].velocity;
4664  }
4665  if ( fixedAtomsOn && a[i].atomFixed ) a[i].velocity = 0;
4666  BigReal v2 = a[i].velocity.length2();
4667  if ( v2 > maxv2 ) maxv2 = v2;
4668  }
4669 
4670  min_reduction->max(0,maxv2);
4671  min_reduction->submit();
4672 
4673  // prevent hydrogens from being left behind
4674  BigReal fmax2 = 0.01 * TIMEFACTOR * TIMEFACTOR * TIMEFACTOR * TIMEFACTOR;
4675  // int adjustCount = 0;
4676  int hgs;
4677  for ( int i = 0; i < numAtoms; i += hgs ) {
4678  hgs = a[i].hydrogenGroupSize;
4679  BigReal minChildVel = a[i].velocity.length2();
4680  if ( minChildVel < fmax2 ) continue;
4681  int adjustChildren = 1;
4682  for ( int j = i+1; j < (i+hgs); ++j ) {
4683  if ( a[j].velocity.length2() > minChildVel ) adjustChildren = 0;
4684  }
4685  if ( adjustChildren ) {
4686  // if ( hgs > 1 ) ++adjustCount;
4687  for ( int j = i+1; j < (i+hgs); ++j ) {
4688  if (a[i].mass < 0.01) continue; // lone pair
4689  a[j].velocity = a[i].velocity;
4690  }
4691  }
4692  }
4693  // if (adjustCount) CkPrintf("Adjusting %d hydrogen groups\n", adjustCount);
4694 
4695 }
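//
// Here v serves as a conjugate-gradient-style search direction,
// v = c*v + f, where c is the coefficient broadcast by the Controller
// (c = 0 restarts from steepest descent).  The submitted max |v|^2
// lets the Controller choose a safe line-search step, and the last
// loop keeps hydrogens from lagging behind a fast-moving parent atom.
//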
4696 
4697 // x = x + c * v
4698 void Sequencer::newMinimizePosition(BigReal c) {
4699  FullAtom *a = patch->atom.begin();
4700  int numAtoms = patch->numAtoms;
4701 
4702  for ( int i = 0; i < numAtoms; ++i ) {
4703  a[i].position += c * a[i].velocity;
4704  }
4705 
4706  if ( simParams->drudeHardWallOn ) {
4707  for ( int i = 1; i < numAtoms; ++i ) {
4708  if ( (0.05 < a[i].mass) && ((a[i].mass < 1.0)) ) { // drude particle
4709  a[i].position -= a[i-1].position;
4710  }
4711  }
4712  }
4713 
4714  patch->rattle1(0.,0,0);
4715 
4716  if ( simParams->drudeHardWallOn ) {
4717  for ( int i = 1; i < numAtoms; ++i ) {
4718  if ( (0.05 < a[i].mass) && ((a[i].mass < 1.0)) ) { // drude particle
4719  a[i].position += a[i-1].position;
4720  }
4721  }
4722  }
4723 }
4724 
4725 // v = 0
4726 void Sequencer::quenchVelocities() {
4727  FullAtom *a = patch->atom.begin();
4728  int numAtoms = patch->numAtoms;
4729 
4730  for ( int i = 0; i < numAtoms; ++i ) {
4731  a[i].velocity = 0;
4732  }
4733 }
4734 
4735 void Sequencer::submitMomentum(int step) {
4736 
4737  FullAtom *a = patch->atom.begin();
4738  const int numAtoms = patch->numAtoms;
4739 
4740  Vector momentum = 0;
4741  BigReal mass = 0;
4742 if ( simParams->zeroMomentumAlt ) {
4743  for ( int i = 0; i < numAtoms; ++i ) {
4744  momentum += a[i].mass * a[i].velocity;
4745  mass += 1.;
4746  }
4747 } else {
4748  for ( int i = 0; i < numAtoms; ++i ) {
4749  momentum += a[i].mass * a[i].velocity;
4750  mass += a[i].mass;
4751  }
4752 }
4753 
4754  ADD_VECTOR_OBJECT(reduction,REDUCTION_HALFSTEP_MOMENTUM,momentum);
4755  reduction->item(REDUCTION_MOMENTUM_MASS) += mass;
4756 }
4757 
4758 void Sequencer::correctMomentum(int step, BigReal drifttime) {
4759 
4760  //
4761  // DJH: This test should be done in SimParameters.
4762  //
4763  if ( simParams->fixedAtomsOn )
4764  NAMD_die("Cannot zero momentum when fixed atoms are present.");
4765 
4766  // Blocking receive for the momentum correction vector.
4767  const Vector dv = broadcast->momentumCorrection.get(step);
4768 
4769  const Vector dx = dv * ( drifttime / TIMEFACTOR );
4770 
4771  FullAtom *a = patch->atom.begin();
4772  const int numAtoms = patch->numAtoms;
4773 
4774 if ( simParams->zeroMomentumAlt ) {
4775  for ( int i = 0; i < numAtoms; ++i ) {
4776  a[i].velocity += dv * a[i].recipMass;
4777  a[i].position += dx * a[i].recipMass;
4778  }
4779 } else {
4780  for ( int i = 0; i < numAtoms; ++i ) {
4781  a[i].velocity += dv;
4782  a[i].position += dx;
4783  }
4784 }
4785 
4786 }
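//
// Momentum-removal bookkeeping: submitMomentum() reduces P = sum(m*v)
// together with either the total mass M or, with zeroMomentumAlt, the
// atom count.  The Controller broadcasts dv so that correctMomentum()
// applies either a uniform boost v += dv (dv = -P/M) or an equal
// per-atom momentum change v += dv/m; both remove exactly P, and the
// positions receive the matching drift correction dx.
//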
4787 
4788 // --------- For Multigrator ---------
4789 void Sequencer::scalePositionsVelocities(const Tensor& posScale, const Tensor& velScale) {
4790  FullAtom *a = patch->atom.begin();
4791  int numAtoms = patch->numAtoms;
4792  Position origin = patch->lattice.origin();
4793  if ( simParams->fixedAtomsOn ) {
4794  NAMD_bug("Sequencer::scalePositionsVelocities, fixed atoms not implemented");
4795  }
4796  if ( simParams->useGroupPressure ) {
4797  int hgs;
4798  for ( int i = 0; i < numAtoms; i += hgs ) {
4799  hgs = a[i].hydrogenGroupSize;
4800  Position pos_cm(0.0, 0.0, 0.0);
4801  Velocity vel_cm(0.0, 0.0, 0.0);
4802  BigReal m_cm = 0.0;
4803  for (int j=0;j < hgs;++j) {
4804  m_cm += a[i+j].mass;
4805  pos_cm += a[i+j].mass*a[i+j].position;
4806  vel_cm += a[i+j].mass*a[i+j].velocity;
4807  }
4808  pos_cm /= m_cm;
4809  vel_cm /= m_cm;
4810  pos_cm -= origin;
4811  Position dpos = posScale*pos_cm;
4812  Velocity dvel = velScale*vel_cm;
4813  for (int j=0;j < hgs;++j) {
4814  a[i+j].position += dpos;
4815  a[i+j].velocity += dvel;
4816  }
4817  }
4818  } else {
4819  for ( int i = 0; i < numAtoms; i++) {
4820  a[i].position += posScale*(a[i].position-origin);
4821  a[i].velocity = velScale*a[i].velocity;
4822  }
4823  }
4824 }
4825 
4826 void Sequencer::multigratorPressure(int step, int callNumber) {
4827 // Calculate new positions, momenta, and volume using positionRescaleFactor and
4828 // velocityRescaleTensor values returned from Controller::multigratorPressureCalcScale()
4829  if (simParams->multigratorOn && !(step % simParams->multigratorPressureFreq)) {
4830  FullAtom *a = patch->atom.begin();
4831  int numAtoms = patch->numAtoms;
4832 
4833  // Blocking receive (get) scaling factors from Controller
4834  Tensor scaleTensor = (callNumber == 1) ? broadcast->positionRescaleFactor.get(step) : broadcast->positionRescaleFactor2.get(step);
4835  Tensor velScaleTensor = (callNumber == 1) ? broadcast->velocityRescaleTensor.get(step) : broadcast->velocityRescaleTensor2.get(step);
4836  Tensor posScaleTensor = scaleTensor;
4837  posScaleTensor -= Tensor::identity();
4838  if (simParams->useGroupPressure) {
4839  velScaleTensor -= Tensor::identity();
4840  }
4841 
4842  // Scale volume
4843  patch->lattice.rescale(scaleTensor);
4844  // Scale positions and velocities
4845  scalePositionsVelocities(posScaleTensor, velScaleTensor);
4846 
4847  if (!patch->flags.doFullElectrostatics) NAMD_bug("Sequencer::multigratorPressure, doFullElectrostatics must be true");
4848 
4849  // Calculate new forces
4850  // NOTE: We should not need to migrate here since any migration should have happened in the
4851  // previous call to runComputeObjects inside the MD loop in Sequencer::integrate()
4852  const int numberOfSteps = simParams->N;
4853  const int stepsPerCycle = simParams->stepsPerCycle;
4854  runComputeObjects(0 , step<numberOfSteps, 1);
4855 
4856  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtoms;
4858 
4859  // Virials etc.
4860  Tensor virialNormal;
4861  Tensor momentumSqrSum;
4862  BigReal kineticEnergy = 0;
4863  if ( simParams->pairInteractionOn ) {
4864  if ( simParams->pairInteractionSelf ) {
4865  for ( int i = 0; i < numAtoms; ++i ) {
4866  if ( a[i].partition != 1 ) continue;
4867  kineticEnergy += a[i].mass * a[i].velocity.length2();
4868  virialNormal.outerAdd(a[i].mass, a[i].velocity, a[i].velocity);
4869  }
4870  }
4871  } else {
4872  for ( int i = 0; i < numAtoms; ++i ) {
4873  if (a[i].mass < 0.01) continue;
4874  kineticEnergy += a[i].mass * a[i].velocity.length2();
4875  virialNormal.outerAdd(a[i].mass, a[i].velocity, a[i].velocity);
4876  }
4877  }
4878  if (!simParams->useGroupPressure) momentumSqrSum = virialNormal;
4879  kineticEnergy *= 0.5;
4880  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += kineticEnergy;
4881  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, virialNormal);
4882 
4883  if ( simParams->fixedAtomsOn ) {
4884  Tensor fixVirialNormal;
4885  Tensor fixVirialNbond;
4886  Tensor fixVirialSlow;
4887  Vector fixForceNormal = 0;
4888  Vector fixForceNbond = 0;
4889  Vector fixForceSlow = 0;
4890 
4891  calcFixVirial(fixVirialNormal, fixVirialNbond, fixVirialSlow, fixForceNormal, fixForceNbond, fixForceSlow);
4892 
4893  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
4894  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
4895  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
4896  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
4897  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
4898  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
4899  }
4900 
4901  // Internal virial and group momentum
4902  Tensor intVirialNormal;
4903  Tensor intVirialNormal2;
4904  Tensor intVirialNbond;
4905  Tensor intVirialSlow;
4906  int hgs;
4907  for ( int i = 0; i < numAtoms; i += hgs ) {
4908  hgs = a[i].hydrogenGroupSize;
4909  int j;
4910  BigReal m_cm = 0;
4911  Position x_cm(0,0,0);
4912  Velocity v_cm(0,0,0);
4913  for ( j = i; j < (i+hgs); ++j ) {
4914  m_cm += a[j].mass;
4915  x_cm += a[j].mass * a[j].position;
4916  v_cm += a[j].mass * a[j].velocity;
4917  }
4918  if (simParams->useGroupPressure) momentumSqrSum.outerAdd(1.0/m_cm, v_cm, v_cm);
4919  x_cm /= m_cm;
4920  v_cm /= m_cm;
4921  if (simParams->fixedAtomsOn) NAMD_bug("Sequencer::multigratorPressure, simParams->fixedAtomsOn not implemented yet");
4922  if ( simParams->pairInteractionOn ) {
4923  if ( simParams->pairInteractionSelf ) {
4924  NAMD_bug("Sequencer::multigratorPressure, this part needs to be implemented correctly");
4925  for ( j = i; j < (i+hgs); ++j ) {
4926  if ( a[j].partition != 1 ) continue;
4927  BigReal mass = a[j].mass;
4928  Vector v = a[j].velocity;
4929  Vector dv = v - v_cm;
4930  intVirialNormal2.outerAdd (mass, v, dv);
4931  Vector dx = a[j].position - x_cm;
4932  intVirialNormal.outerAdd(1.0, patch->f[Results::normal][j], dx);
4933  intVirialNbond.outerAdd(1.0, patch->f[Results::nbond][j], dx);
4934  intVirialSlow.outerAdd(1.0, patch->f[Results::slow][j], dx);
4935  }
4936  }
4937  } else {
4938  for ( j = i; j < (i+hgs); ++j ) {
4939  BigReal mass = a[j].mass;
4940  Vector v = a[j].velocity;
4941  Vector dv = v - v_cm;
4942  intVirialNormal2.outerAdd(mass, v, dv);
4943  Vector dx = a[j].position - x_cm;
4944  intVirialNormal.outerAdd(1.0, patch->f[Results::normal][j], dx);
4945  intVirialNbond.outerAdd(1.0, patch->f[Results::nbond][j], dx);
4946  intVirialSlow.outerAdd(1.0, patch->f[Results::slow][j], dx);
4947  }
4948  }
4949  }
4950 
4951  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, intVirialNormal);
4952  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, intVirialNormal2);
4953  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, intVirialNbond);
4954  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, intVirialSlow);
4955  ADD_TENSOR_OBJECT(reduction, REDUCTION_MOMENTUM_SQUARED, momentumSqrSum);
4956 
4957  reduction->submit();
4958  }
4959 }
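The momentumSqrSum accumulation above folds each hydrogen group's momentum into a running sum of outer products, (P P^T)/M per group; note that outerAdd is applied while v_cm still holds the mass-weighted velocity sum, i.e. the group momentum P. A standalone sketch of the same bookkeeping (editor's addition, not part of Sequencer.C; Vec3 and Mat3 are stand-ins for NAMD's Vector and Tensor, and the masses and velocities are made up):

#include <cstdio>
struct Vec3 { double x, y, z; };
struct Mat3 {
  double m[3][3] = {};
  // accumulate scale * (u outer v), mirroring Tensor::outerAdd
  void outerAdd(double scale, const Vec3& u, const Vec3& v) {
    const double uu[3] = {u.x, u.y, u.z}, vv[3] = {v.x, v.y, v.z};
    for (int r = 0; r < 3; r++)
      for (int c = 0; c < 3; c++)
        m[r][c] += scale * uu[r] * vv[c];
  }
};
int main() {
  // one "hydrogen group": a heavy atom plus two hydrogens
  double mass[3] = {15.999, 1.008, 1.008};
  Vec3 vel[3] = {{0.1, 0.0, 0.0}, {1.0, -0.5, 0.2}, {-0.8, 0.3, 0.1}};
  double m_cm = 0.0;
  Vec3 p_cm = {0.0, 0.0, 0.0};  // group momentum, sum of m*v
  for (int j = 0; j < 3; j++) {
    m_cm += mass[j];
    p_cm.x += mass[j] * vel[j].x;
    p_cm.y += mass[j] * vel[j].y;
    p_cm.z += mass[j] * vel[j].z;
  }
  Mat3 momentumSqrSum;
  momentumSqrSum.outerAdd(1.0 / m_cm, p_cm, p_cm);  // (P P^T) / M
  printf("xx component: %g\n", momentumSqrSum.m[0][0]);
}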
4960 
4961 void Sequencer::scaleVelocities(const BigReal velScale) {
4962  FullAtom *a = patch->atom.begin();
4963  int numAtoms = patch->numAtoms;
4964  for ( int i = 0; i < numAtoms; i++) {
4965  a[i].velocity *= velScale;
4966  }
4967 }
4968 
4969 BigReal Sequencer::calcKineticEnergy() {
4970  FullAtom *a = patch->atom.begin();
4971  int numAtoms = patch->numAtoms;
4972  BigReal kineticEnergy = 0.0;
4973  if ( simParams->pairInteractionOn ) {
4974  if ( simParams->pairInteractionSelf ) {
4975  for (int i = 0; i < numAtoms; ++i ) {
4976  if ( a[i].partition != 1 ) continue;
4977  kineticEnergy += a[i].mass * a[i].velocity.length2();
4978  }
4979  }
4980  } else {
4981  for (int i = 0; i < numAtoms; ++i ) {
4982  kineticEnergy += a[i].mass * a[i].velocity.length2();
4983  }
4984  }
4985  kineticEnergy *= 0.5;
4986  return kineticEnergy;
4987 }
4988 
4989 void Sequencer::multigratorTemperature(int step, int callNumber) {
4990  if (simParams->multigratorOn && !(step % simParams->multigratorTemperatureFreq)) {
4991  // Blocking receive (get) velocity scaling factor.
4992  BigReal velScale = (callNumber == 1) ? broadcast->velocityRescaleFactor.get(step) : broadcast->velocityRescaleFactor2.get(step);
4993  scaleVelocities(velScale);
4994  // Calculate new kineticEnergy
4995  BigReal kineticEnergy = calcKineticEnergy();
4996  multigratorReduction->item(MULTIGRATOR_REDUCTION_KINETIC_ENERGY) += kineticEnergy;
4997  if (callNumber == 1 && !(step % simParams->multigratorPressureFreq)) {
4998  // If this is a pressure cycle, calculate new momentum squared sum
4999  FullAtom *a = patch->atom.begin();
5000  int numAtoms = patch->numAtoms;
5001  Tensor momentumSqrSum;
5002  if (simParams->useGroupPressure) {
5003  int hgs;
5004  for ( int i = 0; i < numAtoms; i += hgs ) {
5005  hgs = a[i].hydrogenGroupSize;
5006  int j;
5007  BigReal m_cm = 0;
5008  Position x_cm(0,0,0);
5009  Velocity v_cm(0,0,0);
5010  for ( j = i; j < (i+hgs); ++j ) {
5011  m_cm += a[j].mass;
5012  x_cm += a[j].mass * a[j].position;
5013  v_cm += a[j].mass * a[j].velocity;
5014  }
5015  momentumSqrSum.outerAdd(1.0/m_cm, v_cm, v_cm);
5016  }
5017  } else {
5018  for ( int i = 0; i < numAtoms; i++) {
5019  momentumSqrSum.outerAdd(a[i].mass, a[i].velocity, a[i].velocity);
5020  }
5021  }
5022  ADD_TENSOR_OBJECT(multigratorReduction, MULTIGRATOR_REDUCTION_MOMENTUM_SQUARED, momentumSqrSum);
5023  }
5024  // Submit reductions (kineticEnergy and, if applicable, momentumSqrSum)
5025  multigratorReduction->submit();
5026 
5027  }
5028 }
5029 // --------- End Multigrator ---------
5030 
5031 //
5032 // DJH: Calls one or more addForceToMomentum overloads, which in turn call the
5033 // HomePatch versions. We should inline these to reduce the number of function calls.
5034 //
5035 void Sequencer::newtonianVelocities(BigReal stepscale, const BigReal timestep,
5036  const BigReal nbondstep,
5037  const BigReal slowstep,
5038  const int staleForces,
5039  const int doNonbonded,
5040  const int doFullElectrostatics)
5041 {
5042  NAMD_EVENT_RANGE_2(patch->flags.event_on,
5043  NamdProfileEvent::NEWTONIAN_VELOCITIES);
5044 
5045  // Deterministic velocity update, accounting for the multigrator scheme
5046  if (staleForces || (doNonbonded && doFullElectrostatics)) {
5047  addForceToMomentum3(stepscale*timestep, Results::normal, 0,
5048  stepscale*nbondstep, Results::nbond, staleForces,
5049  stepscale*slowstep, Results::slow, staleForces);
5050  } else {
5051  addForceToMomentum(stepscale*timestep);
5052  if (staleForces || doNonbonded)
5053  addForceToMomentum(stepscale*nbondstep, Results::nbond, staleForces);
5054  if (staleForces || doFullElectrostatics)
5055  addForceToMomentum(stepscale*slowstep, Results::slow, staleForces);
5056  }
5057 }
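newtonianVelocities() is the multiple-time-stepping "kick": each force class (normal/bonded, nonbonded, slow electrostatics) advances the momenta with its own effective timestep, either fused into one pass (addForceToMomentum3) or as separate kicks. A schematic single-atom version (editor's addition, not part of Sequencer.C; timestep and force values are illustrative, and NAMD's internal TIMEFACTOR unit conversion is omitted):

#include <cstdio>
struct Vec3 { double x, y, z; };
int main() {
  // schematic timesteps: bonded every 1 fs, nonbonded every 2 fs,
  // full electrostatics every 4 fs
  double timestep = 1.0, nbondstep = 2.0, slowstep = 4.0, stepscale = 0.5;
  double recipMass = 1.0 / 15.999;
  Vec3 v = {0.0, 0.0, 0.0};
  Vec3 fNormal = {1.0, 0.0, 0.0}, fNbond = {0.5, 0.0, 0.0}, fSlow = {0.25, 0.0, 0.0};
  // one combined kick, as in addForceToMomentum3(): each force class is
  // weighted by its own stepscale-scaled timestep
  v.x += recipMass * stepscale *
         (timestep * fNormal.x + nbondstep * fNbond.x + slowstep * fSlow.x);
  printf("vx after kick: %g\n", v.x);
}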
5058 
5059 void Sequencer::langevinVelocities(BigReal dt_fs)
5060 {
5061 // This routine is used for the BAOAB integrator,
5062 // Ornstein-Uhlenbeck exact solve for the O-part.
5063 // See B. Leimkuhler and C. Matthews, AMRX (2012)
5064 // Routine originally written by JPhillips, with fresh errors by CMatthews June2012
5065 
5066  if ( simParams->langevinOn && simParams->langevin_useBAOAB )
5067  {
5068  FullAtom *a = patch->atom.begin();
5069  int numAtoms = patch->numAtoms;
5070  Molecule *molecule = Node::Object()->molecule;
5071  BigReal dt = dt_fs * 0.001; // convert to ps
5072  BigReal kbT = BOLTZMANN*(simParams->langevinTemp);
5073  if (simParams->adaptTempOn && simParams->adaptTempLangevin)
5074  {
5075  kbT = BOLTZMANN*adaptTempT;
5076  }
5077 
5078  int lesReduceTemp = simParams->lesOn && simParams->lesReduceTemp;
5079  BigReal tempFactor = lesReduceTemp ? 1.0 / simParams->lesFactor : 1.0;
5080 
5081  for ( int i = 0; i < numAtoms; ++i )
5082  {
5083  BigReal dt_gamma = dt * a[i].langevinParam;
5084  if ( ! dt_gamma ) continue;
5085 
5086  BigReal f1 = exp( -dt_gamma );
5087  BigReal f2 = sqrt( ( 1. - f1*f1 ) * kbT *
5088  ( a[i].partition ? tempFactor : 1.0 ) *
5089  a[i].recipMass );
5090  a[i].velocity *= f1;
5091  a[i].velocity += f2 * random->gaussian_vector();
5092  }
5093  }
5094 }
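The loop above is the exact Ornstein-Uhlenbeck solve of the BAOAB "O" step: v <- exp(-gamma*dt)*v + sqrt((1 - exp(-2*gamma*dt))*kT/m)*xi, with xi a standard Gaussian vector; note that f2 uses (1 - f1*f1) = 1 - exp(-2*gamma*dt). A self-contained one-degree-of-freedom sketch (editor's addition, not part of Sequencer.C; the friction, timestep, and temperature values are illustrative):

#include <cmath>
#include <cstdio>
#include <random>
int main() {
  std::mt19937_64 gen(2012);
  std::normal_distribution<double> gauss(0.0, 1.0);
  double gamma = 5.0;    // friction, 1/ps
  double dt = 0.002;     // ps
  double kT = 0.593;     // ~BOLTZMANN*298 K in kcal/mol
  double mass = 15.999;
  double v = 0.0;
  double f1 = std::exp(-gamma * dt);
  double f2 = std::sqrt((1.0 - f1 * f1) * kT / mass);
  for (int step = 0; step < 1000; step++)
    v = f1 * v + f2 * gauss(gen);  // exact OU update; variance -> kT/m
  printf("v after 1000 O-steps: %g\n", v);
}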
5095 
5096 void Sequencer::langevinVelocitiesBBK1(BigReal dt_fs)
5097 {
5098  NAMD_EVENT_RANGE_2(patch->flags.event_on,
5099  NamdProfileEvent::LANGEVIN_VELOCITIES_BBK1);
5100  if ( simParams->langevinOn && !simParams->langevin_useBAOAB )
5101  {
5102  FullAtom *a = patch->atom.begin();
5103  int numAtoms = patch->numAtoms;
5104  Molecule *molecule = Node::Object()->molecule;
5105  BigReal dt = dt_fs * 0.001; // convert to ps
5106  int i;
5107 
5108  if (simParams->drudeOn) {
5109  for (i = 0; i < numAtoms; i++) {
5110 
5111  if (i < numAtoms-1 &&
5112  a[i+1].mass < 1.0 && a[i+1].mass > 0.05) {
5113  //printf("*** Found Drude particle %d\n", a[i+1].id);
5114  // i+1 is a Drude particle with parent i
5115 
5116  // convert from Cartesian coordinates to (COM,bond) coordinates
5117  BigReal m = a[i+1].mass / (a[i].mass + a[i+1].mass); // mass ratio
5118  Vector v_bnd = a[i+1].velocity - a[i].velocity; // vel of bond
5119  Vector v_com = a[i].velocity + m * v_bnd; // vel of COM
5120  BigReal dt_gamma;
5121 
5122  // use Langevin damping factor i for v_com
5123  dt_gamma = dt * a[i].langevinParam;
5124  if (dt_gamma != 0.0) {
5125  v_com *= ( 1. - 0.5 * dt_gamma );
5126  }
5127 
5128  // use Langevin damping factor i+1 for v_bnd
5129  dt_gamma = dt * a[i+1].langevinParam;
5130  if (dt_gamma != 0.0) {
5131  v_bnd *= ( 1. - 0.5 * dt_gamma );
5132  }
5133 
5134  // convert back
5135  a[i].velocity = v_com - m * v_bnd;
5136  a[i+1].velocity = v_bnd + a[i].velocity;
5137 
5138  i++; // +1 from loop, we've updated both particles
5139  }
5140  else {
5141  BigReal dt_gamma = dt * a[i].langevinParam;
5142  if ( ! dt_gamma ) continue;
5143 
5144  a[i].velocity *= ( 1. - 0.5 * dt_gamma );
5145  }
5146 
5147  } // end for
5148  } // end if drudeOn
5149  else {
5150 
5151  //
5152  // DJH: The conditional inside the loop prevents vectorization, and it doesn't
5153  // avoid much work, since addition and multiplication are cheap.
5154  //
5155  for ( i = 0; i < numAtoms; ++i )
5156  {
5157  BigReal dt_gamma = dt * a[i].langevinParam;
5158  if ( ! dt_gamma ) continue;
5159 
5160  a[i].velocity *= ( 1. - 0.5 * dt_gamma );
5161  }
5162 
5163  } // end else
5164 
5165  } // end if langevinOn
5166 }
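Per the DJH note above, the early "continue" blocks auto-vectorization of the BBK1 half-step damping; since scaling by exactly 1.0 reproduces the skipped path, the loop can be written branch-free. A sketch under that assumption (editor's addition; bbk1_damp is a hypothetical helper, not a NAMD routine):

#include <cstdio>
#include <vector>
// Branch-free BBK1 half-step damping: atoms with langevinParam == 0 are
// multiplied by exactly 1.0, reproducing the "continue" path without a branch.
void bbk1_damp(std::vector<double>& vel,
               const std::vector<double>& langevinParam, double dt) {
  const int n = (int)vel.size();
  for (int i = 0; i < n; i++)
    vel[i] *= 1.0 - 0.5 * dt * langevinParam[i];
}
int main() {
  std::vector<double> v = {1.0, 1.0}, gamma = {0.0, 5.0};
  bbk1_damp(v, gamma, 0.002);      // dt in ps
  printf("%g %g\n", v[0], v[1]);   // the gamma == 0 atom is untouched
}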
5167 
5168 
5169 void Sequencer::langevinVelocitiesBBK2(BigReal dt_fs)
5170 {
5171  NAMD_EVENT_RANGE_2(patch->flags.event_on,
5172  NamdProfileEvent::LANGEVIN_VELOCITIES_BBK2);
5173  if ( simParams->langevinOn && !simParams->langevin_useBAOAB )
5174  {
5175  //
5176  // DJH: This call is expensive. Avoid calling when gammas don't differ.
5177  // Set flag in SimParameters and make this call conditional.
5178  //
5179  TIMER_START(patch->timerSet, RATTLE1);
5180  rattle1(dt_fs,1); // conserve momentum if gammas differ
5181  TIMER_STOP(patch->timerSet, RATTLE1);
5182 
5183  FullAtom *a = patch->atom.begin();
5184  int numAtoms = patch->numAtoms;
5185  Molecule *molecule = Node::Object()->molecule;
5186  BigReal dt = dt_fs * 0.001; // convert to ps
5187  BigReal kbT = BOLTZMANN*(simParams->langevinTemp);
5188  if (simParams->adaptTempOn && simParams->adaptTempLangevin)
5189  {
5190  kbT = BOLTZMANN*adaptTempT;
5191  }
5192  int lesReduceTemp = simParams->lesOn && simParams->lesReduceTemp;
5193  BigReal tempFactor = lesReduceTemp ? 1.0 / simParams->lesFactor : 1.0;
5194  int i;
5195 
5196  if (simParams->drudeOn) {
5197  BigReal kbT_bnd = BOLTZMANN*(simParams->drudeTemp); // drude bond Temp
5198 
5199  for (i = 0; i < numAtoms; i++) {
5200 
5201  if (i < numAtoms-1 &&
5202  a[i+1].mass < 1.0 && a[i+1].mass > 0.05) {
5203  //printf("*** Found Drude particle %d\n", a[i+1].id);
5204  // i+1 is a Drude particle with parent i
5205 
5206  // convert from Cartesian coordinates to (COM,bond) coordinates
5207  BigReal m = a[i+1].mass / (a[i].mass + a[i+1].mass); // mass ratio
5208  Vector v_bnd = a[i+1].velocity - a[i].velocity; // vel of bond
5209  Vector v_com = a[i].velocity + m * v_bnd; // vel of COM
5210  BigReal dt_gamma;
5211 
5212  // use Langevin damping factor i for v_com
5213  dt_gamma = dt * a[i].langevinParam;
5214  if (dt_gamma != 0.0) {
5215  BigReal mass = a[i].mass + a[i+1].mass;
5216  v_com += random->gaussian_vector() *
5217  sqrt( 2 * dt_gamma * kbT *
5218  ( a[i].partition ? tempFactor : 1.0 ) / mass );
5219  v_com /= ( 1. + 0.5 * dt_gamma );
5220  }
5221 
5222  // use Langevin damping factor i+1 for v_bnd
5223  dt_gamma = dt * a[i+1].langevinParam;
5224  if (dt_gamma != 0.0) {
5225  BigReal mass = a[i+1].mass * (1. - m);
5226  v_bnd += random->gaussian_vector() *
5227  sqrt( 2 * dt_gamma * kbT_bnd *
5228  ( a[i+1].partition ? tempFactor : 1.0 ) / mass );
5229  v_bnd /= ( 1. + 0.5 * dt_gamma );
5230  }
5231 
5232  // convert back
5233  a[i].velocity = v_com - m * v_bnd;
5234  a[i+1].velocity = v_bnd + a[i].velocity;
5235 
5236  i++; // +1 from loop, we've updated both particles
5237  }
5238  else {
5239  BigReal dt_gamma = dt * a[i].langevinParam;
5240  if ( ! dt_gamma ) continue;
5241 
5242  a[i].velocity += random->gaussian_vector() *
5243  sqrt( 2 * dt_gamma * kbT *
5244  ( a[i].partition ? tempFactor : 1.0 ) * a[i].recipMass );
5245  a[i].velocity /= ( 1. + 0.5 * dt_gamma );
5246  }
5247 
5248  } // end for
5249  } // end if drudeOn
5250  else {
5251 
5252  //
5253  // DJH: For the case of a uniform gamma (the Langevin parameter),
5254  // no partitions (e.g. FEP), and no adaptive tempering (adaptTempMD),
5255  // we can precompute constants. Then, by lifting the RNG out of the
5256  // loop (filling an array of random numbers in advance), we can vectorize
5257  // the loop and simplify the arithmetic to just addition and multiplication.
5258  //
5259  for ( i = 0; i < numAtoms; ++i )
5260  {
5261  BigReal dt_gamma = dt * a[i].langevinParam;
5262  if ( ! dt_gamma ) continue;
5263 
5264  a[i].velocity += random->gaussian_vector() *
5265  sqrt( 2 * dt_gamma * kbT *
5266  ( a[i].partition ? tempFactor : 1.0 ) * a[i].recipMass );
5267  a[i].velocity /= ( 1. + 0.5 * dt_gamma );
5268  }
5269 
5270  } // end else
5271 
5272  } // end if langevinOn
5273 }
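Following the DJH note above: with a uniform gamma, no partitions, and no adaptive tempering, the 1/(1 + gamma*dt/2) divisor becomes a loop constant and the Gaussian draws can be filled into an array up front, leaving a vectorizable loop. A sketch under those assumptions (editor's addition; bbk2_kick is a hypothetical helper, not a NAMD routine):

#include <cmath>
#include <cstdio>
#include <random>
#include <vector>
void bbk2_kick(std::vector<double>& vel, const std::vector<double>& recipMass,
               double gamma, double dt, double kT, std::mt19937_64& gen) {
  const int n = (int)vel.size();
  std::normal_distribution<double> gauss(0.0, 1.0);
  std::vector<double> noise(n);
  for (int i = 0; i < n; i++) noise[i] = gauss(gen);  // RNG lifted out of loop
  const double dt_gamma = dt * gamma;
  const double inv = 1.0 / (1.0 + 0.5 * dt_gamma);    // precomputed constant
  for (int i = 0; i < n; i++)                         // now vectorizable
    vel[i] = (vel[i] +
              std::sqrt(2.0 * dt_gamma * kT * recipMass[i]) * noise[i]) * inv;
}
int main() {
  std::mt19937_64 gen(42);
  std::vector<double> v(4, 0.0), rm(4, 1.0 / 12.011);
  bbk2_kick(v, rm, 5.0, 0.002, 0.593, gen);
  printf("v[0] = %g\n", v[0]);
}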
5274 
5275 
5276 void Sequencer::berendsenPressure(int step)
5277 {
5278  if ( simParams->berendsenPressureOn ) {
5279  berendsenPressure_count += 1;
5280  const int freq = simParams->berendsenPressureFreq;
5281  if ( ! (berendsenPressure_count % freq ) ) {
5282  berendsenPressure_count = 0;
5283  FullAtom *a = patch->atom.begin();
5284  int numAtoms = patch->numAtoms;
5285  // Blocking receive for the updated lattice scaling factor.
5286  Tensor factor = broadcast->positionRescaleFactor.get(step);
5287  patch->lattice.rescale(factor);
5288  if ( simParams->useGroupPressure )
5289  {
5290  int hgs;
5291  for ( int i = 0; i < numAtoms; i += hgs ) {
5292  int j;
5293  hgs = a[i].hydrogenGroupSize;
5294  if ( simParams->fixedAtomsOn && a[i].groupFixed ) {
5295  for ( j = i; j < (i+hgs); ++j ) {
5296  a[j].position = patch->lattice.reverse_transform(
5297  a[j].fixedPosition,a[j].transform);
5298  }
5299  continue;
5300  }
5301  BigReal m_cm = 0;
5302  Position x_cm(0,0,0);
5303  for ( j = i; j < (i+hgs); ++j ) {
5304  if ( simParams->fixedAtomsOn && a[j].atomFixed ) continue;
5305  m_cm += a[j].mass;
5306  x_cm += a[j].mass * a[j].position;
5307  }
5308  x_cm /= m_cm;
5309  Position new_x_cm = x_cm;
5310  patch->lattice.rescale(new_x_cm,factor);
5311  Position delta_x_cm = new_x_cm - x_cm;
5312  for ( j = i; j < (i+hgs); ++j ) {
5313  if ( simParams->fixedAtomsOn && a[j].atomFixed ) {
5314  a[j].position = patch->lattice.reverse_transform(
5315  a[j].fixedPosition,a[j].transform);
5316  continue;
5317  }
5318  a[j].position += delta_x_cm;
5319  }
5320  }
5321  }
5322  else
5323  {
5324  for ( int i = 0; i < numAtoms; ++i )
5325  {
5326  if ( simParams->fixedAtomsOn && a[i].atomFixed ) {
5327  a[i].position = patch->lattice.reverse_transform(
5328  a[i].fixedPosition,a[i].transform);
5329  continue;
5330  }
5331  patch->lattice.rescale(a[i].position,factor);
5332  }
5333  }
5334  }
5335  } else {
5336  berendsenPressure_count = 0;
5337  }
5338 }
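With useGroupPressure, the Berendsen rescaling above moves each hydrogen group rigidly: only the group's center of mass is remapped by the new lattice, and every atom in the group receives the same shift, so constrained bonds to hydrogen are not strained. A schematic isotropic, one-dimensional version (editor's addition, not part of Sequencer.C; masses and positions are made up):

#include <cstdio>
int main() {
  double factor = 0.999;                 // isotropic cell scaling this cycle
  double mass[3] = {15.999, 1.008, 1.008};
  double x[3] = {10.0, 10.8, 9.4};       // one group, 1-D positions
  double m_cm = 0.0, x_cm = 0.0;
  for (int j = 0; j < 3; j++) { m_cm += mass[j]; x_cm += mass[j] * x[j]; }
  x_cm /= m_cm;
  double delta = factor * x_cm - x_cm;        // COM shift under rescaling
  for (int j = 0; j < 3; j++) x[j] += delta;  // rigid shift of whole group
  printf("new positions: %g %g %g\n", x[0], x[1], x[2]);
}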
5339 
5340 void Sequencer::langevinPiston(int step)
5341 {
5342  if ( simParams->langevinPistonOn && ! ( (step-1-slowFreq/2) % slowFreq ) )
5343  {
5344  //
5345  // DJH: The loops below would simplify if we lifted the special cases of fixed
5346  // atoms and pressure-excluded atoms out into their own branches.
5347  //
5348  FullAtom *a = patch->atom.begin();
5349  int numAtoms = patch->numAtoms;
5350  // Blocking receive for the updated lattice scaling factor.
5351  Tensor factor = broadcast->positionRescaleFactor.get(step);
5352  TIMER_START(patch->timerSet, PISTON);
5353  // JCP FIX THIS!!!
5354  Vector velFactor(1/factor.xx,1/factor.yy,1/factor.zz);
5355  patch->lattice.rescale(factor);
5356  Molecule *mol = Node::Object()->molecule;
5357  if ( simParams->useGroupPressure )
5358  {
5359  int hgs;
5360  for ( int i = 0; i < numAtoms; i += hgs ) {
5361  int j;
5362  hgs = a[i].hydrogenGroupSize;
5363  if ( simParams->fixedAtomsOn && a[i].groupFixed ) {
5364  for ( j = i; j < (i+hgs); ++j ) {
5365  a[j].position = patch->lattice.reverse_transform(
5366  a[j].fixedPosition,a[j].transform);
5367  }
5368  continue;
5369  }
5370  BigReal m_cm = 0;
5371  Position x_cm(0,0,0);
5372  Velocity v_cm(0,0,0);
5373  for ( j = i; j < (i+hgs); ++j ) {
5374  if ( simParams->fixedAtomsOn && a[j].atomFixed ) continue;
5375  m_cm += a[j].mass;
5376  x_cm += a[j].mass * a[j].position;
5377  v_cm += a[j].mass * a[j].velocity;
5378  }
5379  x_cm /= m_cm;
5380  Position new_x_cm = x_cm;
5381  patch->lattice.rescale(new_x_cm,factor);
5382  Position delta_x_cm = new_x_cm - x_cm;
5383  v_cm /= m_cm;
5384  Velocity delta_v_cm;
5385  delta_v_cm.x = ( velFactor.x - 1 ) * v_cm.x;
5386  delta_v_cm.y = ( velFactor.y - 1 ) * v_cm.y;
5387  delta_v_cm.z = ( velFactor.z - 1 ) * v_cm.z;
5388  for ( j = i; j < (i+hgs); ++j ) {
5389  if ( simParams->fixedAtomsOn && a[j].atomFixed ) {
5390  a[j].position = patch->lattice.reverse_transform(
5391  a[j].fixedPosition,a[j].transform);
5392  continue;
5393  }
5394  if ( mol->is_atom_exPressure(a[j].id) ) continue;
5395  a[j].position += delta_x_cm;
5396  a[j].velocity += delta_v_cm;
5397  }
5398  }
5399  }
5400  else
5401  {
5402  for ( int i = 0; i < numAtoms; ++i )
5403  {
5404  if ( simParams->fixedAtomsOn && a[i].atomFixed ) {
5405  a[i].position = patch->lattice.reverse_transform(
5406  a[i].fixedPosition,a[i].transform);
5407  continue;
5408  }
5409  if ( mol->is_atom_exPressure(a[i].id) ) continue;
5410  patch->lattice.rescale(a[i].position,factor);
5411  a[i].velocity.x *= velFactor.x;
5412  a[i].velocity.y *= velFactor.y;
5413  a[i].velocity.z *= velFactor.z;
5414  }
5415  }
5416  TIMER_STOP(patch->timerSet, PISTON);
5417  }
5418 }
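Two details of langevinPiston() worth spelling out: the update fires only on steps where (step - 1 - slowFreq/2) is a multiple of slowFreq, i.e. halfway between slow-force steps, and velocities are scaled by the reciprocal of the diagonal of the position-scaling tensor (the "JCP FIX THIS!!!" note flags that off-diagonal terms are ignored). A small check of both (editor's addition; the slowFreq and factor values are illustrative):

#include <cstdio>
int main() {
  int slowFreq = 4;
  for (int step = 1; step <= 12; step++) {
    if (!((step - 1 - slowFreq / 2) % slowFreq))
      printf("piston update at step %d\n", step);  // fires at steps 3, 7, 11
  }
  // diagonal of the position rescale factor and the matching velocity factors
  double fxx = 1.001, fyy = 1.001, fzz = 0.998;
  printf("velFactor = (%g, %g, %g)\n", 1 / fxx, 1 / fyy, 1 / fzz);
}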
5419