NAMD
SequencerCUDA.C
1 #include "ComputeLonepairsCUDA.h"
2 #include "CudaUtils.h"
3 #include "Molecule.h"
4 #include "ReductionMgr.h"
5 #include "SequencerCUDA.h"
6 #include "ComputeNonbondedUtil.h"
7 #include "DeviceCUDA.h"
8 #include "SimParameters.h"
9 #include "TestArray.h"
10 #include "ComputeRestraintsCUDA.h"
11 #include "ComputeGridForceCUDA.h"
12 #include "ComputeConsForceCUDA.h"
13 #include "NamdEventsProfiling.h"
14 #include "AtomMap.h"
15 #include "common.h"
16 #include <algorithm> // std::fill()
18 //#define DEBUGM
19 //#define MIN_DEBUG_LEVEL 3
20 #ifdef NODEGROUP_FORCE_REGISTER
21 extern __thread DeviceCUDA *deviceCUDA;
22 
23 #if 1
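// The two macros below gather one per-atom SOA field from every patch handled by
// this device into the contiguous host buffer <fieldName>, then copy it to the
// matching device buffer d_<fieldName>. Illustrative use only (assuming the field
// and stream are in scope): AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);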
24 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
25  size_t offset = 0; \
26  for (int i = 0; i < numPatchesHome; i++) { \
27  PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
28  const int numPatchAtoms = current.numAtoms; \
29  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
30  offset += numPatchAtoms; \
31  } \
32  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
33  } while(0);
34 
35 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
36  size_t offset = 0; \
37  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
38  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
39  const int numPatchAtoms = current.numAtoms; \
40  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
41  offset += numPatchAtoms; \
42  } \
43  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
44  } while(0);
45 
46 #else
47 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
48  size_t offset = 0; \
49  for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
50  PatchDataSOA& current = elem->patch->patchDataSOA; \
51  const int numPatchAtoms = current.numAtoms; \
52  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
53  offset += numPatchAtoms; \
54  } \
55  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
56  } while(0);
57 #endif
58 
59 // Registers this device's SOA device pointers in the host-side patchData tables (h_soa_*), so they can later be copied into every device's peer-pointer arrays
60 void SequencerCUDA::registerSOAPointersToHost(){
61  // fprintf(stderr, "Dev[%d] setting memories\n", deviceID);
62  patchData->h_soa_pos_x[deviceIndex] = this->d_pos_x;
63  patchData->h_soa_pos_y[deviceIndex] = this->d_pos_y;
64  patchData->h_soa_pos_z[deviceIndex] = this->d_pos_z;
65 
66  patchData->h_soa_vel_x[deviceIndex] = this->d_vel_x;
67  patchData->h_soa_vel_y[deviceIndex] = this->d_vel_y;
68  patchData->h_soa_vel_z[deviceIndex] = this->d_vel_z;
69 
70  patchData->h_soa_fb_x[deviceIndex] = this->d_f_normal_x;
71  patchData->h_soa_fb_y[deviceIndex] = this->d_f_normal_y;
72  patchData->h_soa_fb_z[deviceIndex] = this->d_f_normal_z;
73 
74  patchData->h_soa_fn_x[deviceIndex] = this->d_f_nbond_x;
75  patchData->h_soa_fn_y[deviceIndex] = this->d_f_nbond_y;
76  patchData->h_soa_fn_z[deviceIndex] = this->d_f_nbond_z;
77 
78  patchData->h_soa_fs_x[deviceIndex] = this->d_f_slow_x;
79  patchData->h_soa_fs_y[deviceIndex] = this->d_f_slow_y;
80  patchData->h_soa_fs_z[deviceIndex] = this->d_f_slow_z;
81 
82  patchData->h_soa_id[deviceIndex] = this->d_idMig;
83  patchData->h_soa_vdwType[deviceIndex] = this->d_vdwType;
84  patchData->h_soa_sortOrder[deviceIndex] = this->d_sortOrder;
85  patchData->h_soa_unsortOrder[deviceIndex] = this->d_unsortOrder;
86  patchData->h_soa_charge[deviceIndex] = this->d_charge;
87  patchData->h_soa_patchCenter[deviceIndex] = this->d_patchCenter;
88  patchData->h_soa_migrationDestination[deviceIndex] = this->d_migrationDestination;
89  patchData->h_soa_sortSoluteIndex[deviceIndex] = this->d_sortSoluteIndex;
90 
91  patchData->h_soa_partition[deviceIndex] = this->d_partition;
92 
93  patchData->h_atomdata_AoS[deviceIndex] = (FullAtom*) this->d_atomdata_AoS_in;
94  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
95 }
96 
97 void SequencerCUDA::copySOAHostRegisterToDevice(){
98  // This function gets the host-registered SOA device pointers and copies the register itself to the device
99  // NOTE: This needs to be called only when ALL masterPEs have safely called ::registerSOAPointersToHost()
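 // Layout note: each d_peer_* table holds nDevices entries (one buffer pointer per
 // device, indexed by deviceIndex), so multi-GPU kernels can reach a peer device's
 // arrays directly, e.g. d_peer_pos_x[peerDevice][atomIndex].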
100  cudaCheck(cudaSetDevice(deviceID));
101  copy_HtoD<double*>(patchData->h_soa_pos_x, this->d_peer_pos_x, nDevices, stream);
102  copy_HtoD<double*>(patchData->h_soa_pos_y, this->d_peer_pos_y, nDevices, stream);
103  copy_HtoD<double*>(patchData->h_soa_pos_z, this->d_peer_pos_z, nDevices, stream);
104 
105  copy_HtoD<float*>(patchData->h_soa_charge, this->d_peer_charge, nDevices, stream);
106  if (simParams->alchOn) {
107  copy_HtoD<int*>(patchData->h_soa_partition, this->d_peer_partition, nDevices, stream);
108  }
109 
110  copy_HtoD<double*>(patchData->h_soa_vel_x, this->d_peer_vel_x, nDevices, stream);
111  copy_HtoD<double*>(patchData->h_soa_vel_y, this->d_peer_vel_y, nDevices, stream);
112  copy_HtoD<double*>(patchData->h_soa_vel_z, this->d_peer_vel_z, nDevices, stream);
113 
114  copy_HtoD<double*>(patchData->h_soa_fb_x, this->d_peer_fb_x, nDevices, stream);
115  copy_HtoD<double*>(patchData->h_soa_fb_y, this->d_peer_fb_y, nDevices, stream);
116  copy_HtoD<double*>(patchData->h_soa_fb_z, this->d_peer_fb_z, nDevices, stream);
117 
118  copy_HtoD<double*>(patchData->h_soa_fn_x, this->d_peer_fn_x, nDevices, stream);
119  copy_HtoD<double*>(patchData->h_soa_fn_y, this->d_peer_fn_y, nDevices, stream);
120  copy_HtoD<double*>(patchData->h_soa_fn_z, this->d_peer_fn_z, nDevices, stream);
121 
122  copy_HtoD<double*>(patchData->h_soa_fs_x, this->d_peer_fs_x, nDevices, stream);
123  copy_HtoD<double*>(patchData->h_soa_fs_y, this->d_peer_fs_y, nDevices, stream);
124  copy_HtoD<double*>(patchData->h_soa_fs_z, this->d_peer_fs_z, nDevices, stream);
125 
126  copy_HtoD<int4*>(patchData->h_soa_migrationDestination, this->d_peer_migrationDestination, nDevices, stream);
127  copy_HtoD<int*>(patchData->h_soa_sortSoluteIndex, this->d_peer_sortSoluteIndex, nDevices, stream);
128 
129  copy_HtoD<int*>(patchData->h_soa_id, this->d_peer_id, nDevices, stream);
130  copy_HtoD<int*>(patchData->h_soa_vdwType, this->d_peer_vdwType, nDevices, stream);
131  copy_HtoD<int*>(patchData->h_soa_sortOrder, this->d_peer_sortOrder, nDevices, stream);
132  copy_HtoD<int*>(patchData->h_soa_unsortOrder, this->d_peer_unsortOrder, nDevices, stream);
133  copy_HtoD<double3*>(patchData->h_soa_patchCenter, this->d_peer_patchCenter, nDevices, stream);
134 
135  copy_HtoD<FullAtom*>(patchData->h_atomdata_AoS, this->d_peer_atomdata, nDevices, stream);
136  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);
137 
138  // aggregate device pointers
139  for(int i = 0; i < this->nDevices; i++)
140  h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
141  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
142 }
143 
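// Debugging helper: copies the home-atom positions and velocities back to the
// host and prints them patch by patch while holding the shared print lock.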
144 void SequencerCUDA::printSOAPositionsAndVelocities() {
145  BigReal* h_pos_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
146  BigReal* h_pos_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
147  BigReal* h_pos_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
148 
149  BigReal* h_vel_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
150  BigReal* h_vel_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
151  BigReal* h_vel_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
152 
153  // DMC thinks this condition was a holdover from the NCCL code path
154  if(false && mGpuOn){
155  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
156  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
157  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
158  }else{
159  copy_DtoH_sync<BigReal>(d_pos_x, h_pos_x, numAtomsHome);
160  copy_DtoH_sync<BigReal>(d_pos_y, h_pos_y, numAtomsHome);
161  copy_DtoH_sync<BigReal>(d_pos_z, h_pos_z, numAtomsHome);
162  }
163 
164  copy_DtoH_sync<BigReal>(d_vel_x, h_vel_x, numAtomsHome);
165  copy_DtoH_sync<BigReal>(d_vel_y, h_vel_y, numAtomsHome);
166  copy_DtoH_sync<BigReal>(d_vel_z, h_vel_z, numAtomsHome);
167 
168  CmiLock(this->patchData->printlock);
169  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
170  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
171  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
172  for(int i =0 ; i < numPatchesHome; i++){
173  CudaLocalRecord record = localPatches[i];
174  const int patchID = record.patchID;
175  const int stride = record.bufferOffset;
176  const int numPatchAtoms = record.numAtoms;
177  PatchDataSOA& current = homePatches[i]->patchDataSOA;
178 
179  fprintf(stderr, "Patch [%d]:\n", patchID);
180  for(int j = 0; j < numPatchAtoms; j++){
181  fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
182  h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
183  h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
184  }
185 
186  }
187  CmiUnlock(this->patchData->printlock);
188 
189  free(h_pos_x);
190  free(h_pos_y);
191  free(h_pos_z);
192  free(h_vel_x);
193  free(h_vel_y);
194  free(h_vel_z);
195 }
196 
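// Debugging helper: copies the normal, nonbonded, and slow force buffers for the
// home atoms back to the host and prints them per patch; when prefix is non-NULL,
// each patch's forces are written to a file named "<prefix>-patch-<patchID>".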
197 void SequencerCUDA::printSOAForces(char *prefix) {
198  BigReal* h_f_normal_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
199  BigReal* h_f_normal_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
200  BigReal* h_f_normal_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
201 
202  BigReal* h_f_nbond_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
203  BigReal* h_f_nbond_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
204  BigReal* h_f_nbond_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
205 
206  BigReal* h_f_slow_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
207  BigReal* h_f_slow_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
208  BigReal* h_f_slow_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
209 
210  copy_DtoH_sync<BigReal>(d_f_normal_x, h_f_normal_x, numAtomsHome);
211  copy_DtoH_sync<BigReal>(d_f_normal_y, h_f_normal_y, numAtomsHome);
212  copy_DtoH_sync<BigReal>(d_f_normal_z, h_f_normal_z, numAtomsHome);
213 
214  copy_DtoH_sync<BigReal>(d_f_nbond_x, h_f_nbond_x, numAtomsHome);
215  copy_DtoH_sync<BigReal>(d_f_nbond_y, h_f_nbond_y, numAtomsHome);
216  copy_DtoH_sync<BigReal>(d_f_nbond_z, h_f_nbond_z, numAtomsHome);
217 
218  copy_DtoH_sync<BigReal>(d_f_slow_x, h_f_slow_x, numAtomsHome);
219  copy_DtoH_sync<BigReal>(d_f_slow_y, h_f_slow_y, numAtomsHome);
220  copy_DtoH_sync<BigReal>(d_f_slow_z, h_f_slow_z, numAtomsHome);
221 
222  // Print the force buffers per patch, holding the print lock
223  CmiLock(this->patchData->printlock);
224  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
225  char fname[100];
226  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
227  for(int i =0 ; i < numPatchesHome; i++){
228  CudaLocalRecord record = localPatches[i];
229  const int patchID = record.patchID;
230  const int stride = record.bufferOffset;
231  const int numPatchAtoms = record.numAtoms;
232  FILE *outfile=stderr;
233  if(prefix!=NULL)
234  {
235  snprintf(fname,100, "%s-patch-%d", prefix, patchID);
236  outfile = fopen(fname, "w");
237  }
238  fprintf(outfile, "Patch [%d]:\n", patchID);
239  for(int j = 0; j < numPatchAtoms; j++){
240  fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
241  h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
242  h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
243  h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j] );
244  }
245  if(prefix!=NULL) fclose(outfile);
246  }
247 
248  CmiUnlock(this->patchData->printlock);
249 
250  free(h_f_normal_x);
251  free(h_f_normal_y);
252  free(h_f_normal_z);
253 
254  free(h_f_nbond_x);
255  free(h_f_nbond_y);
256  free(h_f_nbond_z);
257 
258  free(h_f_slow_x);
259  free(h_f_slow_y);
260  free(h_f_slow_z);
261 
262 }
263 
264 SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
265  SimParameters *const sim_Params) {
266  if (CkpvAccess(SequencerCUDA_instance) == 0) {
267  CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
268  }
269  return CkpvAccess(SequencerCUDA_instance);
270 }
271 
272 SequencerCUDA::SequencerCUDA(const int deviceID_ID,
273  SimParameters *const sim_Params):
274  deviceID(deviceID_ID), simParams(sim_Params)
275 {
276  restraintsKernel = NULL;
277  SMDKernel = NULL;
278  groupRestraintsKernel = NULL;
279  gridForceKernel = NULL;
280  consForceKernel = NULL;
281  lonepairsKernel = nullptr;
282 #if 1
283  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
284  patchData = cpdata.ckLocalBranch();
285 #endif
286  initialize();
287  CUDASequencerKernel = new SequencerCUDAKernel();
288  CUDAMigrationKernel = new MigrationCUDAKernel();
289  num_used_grids = simParams->alchGetNumOfPMEGrids();
290  used_grids.resize(num_used_grids, 0);
291  if (simParams->alchFepOn) {
292  // at least two grids are used
293  used_grids[0] = 0;
294  used_grids[1] = 1;
295  // if alchDecouple then two more grids are used
296  if (simParams->alchDecouple) {
297  used_grids[2] = 2;
298  used_grids[3] = 3;
299  // an extra for soft-core potential
300  if (simParams->alchElecLambdaStart > 0) {
301  used_grids[4] = 4;
302  }
303  } else {
304  // in this case alchDecouple is false
305  // but if there is still soft-core potential
306  // then a total of 3 grids are used
307  // mark the last grid for soft-core potential
308  if (simParams->alchElecLambdaStart > 0) {
309  used_grids[2] = 4;
310  }
311  }
312  }
313  if (simParams->alchThermIntOn) {
314  used_grids[0] = 0;
315  used_grids[1] = 1;
316  // in TI, no matter whether soft-core potential is used
317  // it has at least three grids
318  if (simParams->alchDecouple) {
319  used_grids[2] = 2;
320  used_grids[3] = 3;
321  used_grids[4] = 4;
322  } else {
323  used_grids[2] = 4;
324  }
325  }
326  rescalePairlistTolerance = false;
327 }
328 
329 SequencerCUDA::~SequencerCUDA(){
330  cudaCheck(cudaSetDevice(deviceID));
331  deallocateArrays();
332  deallocateStaticArrays();
333  deallocate_device<SettleParameters>(&sp);
334  deallocate_device<int>(&settleList);
335  deallocate_device<CudaRattleElem>(&rattleList);
336  deallocate_device<int>(&d_consFailure);
337  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
338  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
339  if (restraintsKernel != NULL) delete restraintsKernel;
340  if(SMDKernel != NULL) delete SMDKernel;
341  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
342  if (gridForceKernel != NULL) delete gridForceKernel;
343  if (lonepairsKernel != nullptr) delete lonepairsKernel;
344  if (consForceKernel != NULL) delete consForceKernel;
345 #if 0
346  cudaCheck(cudaStreamDestroy(stream));
347 #endif
348  cudaCheck(cudaStreamDestroy(stream2));
349  curandCheck(curandDestroyGenerator(curandGen));
350  CmiDestroyLock(printlock);
351 
352 }
353 
354 void SequencerCUDA::zeroScalars(){
355  numAtomsHomeAndProxyAllocated = 0;
356  numAtomsHomeAllocated = 0;
357  buildRigidLists = true;
358  numPatchesCheckedIn = 0;
359  numPatchesReady= 0;
360 }
361 
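// One-time per-device setup: sets up the CUDA streams and cuRAND generator, sizes
// the peer-pointer tables, allocates patch-level host/device buffers and the
// reduction scalars/tensors, builds this device's patch list, and constructs the
// optional restraint, SMD, group-restraint, grid-force, lone-pair, and
// constant-force kernels when the corresponding features are enabled.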
362 void SequencerCUDA::initialize(){
363  cudaCheck(cudaSetDevice(deviceID));
364  nDevices = deviceCUDA->getNumDevice() + 1 * deviceCUDA->isGpuReservedPme();
365  deviceIndex = deviceCUDA->getDeviceIndex();
366 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
367  int leastPriority, greatestPriority;
368  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
369 #if 0
370  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
371 #else
372  stream = (CkMyPe() == deviceCUDA->getMasterPe()) ? patchData->devData[deviceIndex].nbond_stream : 0;
373 #endif
374  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
375 #else
376  cudaCheck(cudaStreamCreate(&stream));
377  cudaCheck(cudaStreamCreate(&stream2));
378 #endif
379  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
380  // each PE's seed needs to be different
381  unsigned long long seed = simParams->randomSeed + CkMyPe();
382  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));
383 
384  numAtomsHomeAllocated = 0;
385  numAtomsHomeAndProxyAllocated = 0;
386 
387  totalMarginViolations = 0;
388  buildRigidLists = true;
389  numPatchesCheckedIn = 0;
390  numPatchesReady= 0;
391  PatchMap* patchMap = PatchMap::Object();
392 #if 1
393  numPatchesGlobal = patchMap->numPatches();
394 #else
395  numPatchesGlobal = patchMap->homePatchList()->size();
396 #endif
397  mGpuOn = nDevices > 1;
398  // Device queue allocation (currently disabled):
399  // allocate_device<unsigned int>(&deviceQueue, nDevices);
400  // cudaCheck(cudaMemset(deviceQueue, 999, sizeof(unsigned int)* nDevices));
401  // Allocates and registers the local queue in patchData
402  // patchData->d_queues[deviceIndex] = deviceQueue;
403 
404  printlock = CmiCreateLock();
405 
406  const int numPes = CkNumPes();
407  if (!mGpuOn) {
408  atomMapList.resize(numPes);
409  }
410 
411  //Allocates SOA registers
412  allocate_device<double*>(&d_peer_pos_x, nDevices);
413  allocate_device<double*>(&d_peer_pos_y, nDevices);
414  allocate_device<double*>(&d_peer_pos_z, nDevices);
415  allocate_device<float*>(&d_peer_charge, nDevices);
416  if (simParams->alchOn) {
417  allocate_device<int*>(&d_peer_partition, nDevices);
418  }
419  allocate_device<double*>(&d_peer_vel_x, nDevices);
420  allocate_device<double*>(&d_peer_vel_y, nDevices);
421  allocate_device<double*>(&d_peer_vel_z, nDevices);
422 
423  if (simParams->fixedAtomsOn) {
424  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
425  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
426  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
427  allocate_device<double3>(&d_fixForceNormal, 1);
428  allocate_device<double3>(&d_fixForceNbond, 1);
429  allocate_device<double3>(&d_fixForceSlow, 1);
430  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
431  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
432  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
433  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
434  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
435  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
436  }
437 
438  allocate_device<double*>(&d_peer_fb_x, nDevices);
439  allocate_device<double*>(&d_peer_fb_y, nDevices);
440  allocate_device<double*>(&d_peer_fb_z, nDevices);
441  allocate_device<double*>(&d_peer_fn_x, nDevices);
442  allocate_device<double*>(&d_peer_fn_y, nDevices);
443  allocate_device<double*>(&d_peer_fn_z, nDevices);
444  allocate_device<double*>(&d_peer_fs_x, nDevices);
445  allocate_device<double*>(&d_peer_fs_y, nDevices);
446  allocate_device<double*>(&d_peer_fs_z, nDevices);
447 
448  allocate_device<int4*>(&d_peer_migrationDestination, nDevices);
449  allocate_device<int*>(&d_peer_sortSoluteIndex, nDevices);
450  allocate_device<int*>(&d_peer_id, nDevices);
451  allocate_device<int*>(&d_peer_vdwType, nDevices);
452  allocate_device<int*>(&d_peer_sortOrder, nDevices);
453  allocate_device<int*>(&d_peer_unsortOrder, nDevices);
454  allocate_device<double3*>(&d_peer_patchCenter, nDevices);
455 
456  allocate_device<FullAtom*>(&d_peer_atomdata, nDevices);
457  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
458 
459  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
460 
461  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
462  // Patch-related host datastructures
463  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
464  allocate_host<double3>(&patchCenter, numPatchesGlobal);
465  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
466  allocate_host<int>(&patchToDeviceMap,numPatchesGlobal);
467  allocate_host<double3>(&awayDists, numPatchesGlobal);
468  allocate_host<double3>(&patchMin, numPatchesGlobal);
469  allocate_host<double3>(&patchMax, numPatchesGlobal);
470  //allocate_host<Lattice>(&lattices, numPatchesGlobal);
471  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal); // only needed for langevin
472  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
473  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
474  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);
475 
476  //Patch-related device datastructures
477  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
478  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
479  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
480  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
481  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
482  allocate_device<double3>(&d_patchCenter, numPatchesGlobal);
483  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
484  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
485  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
486  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
487  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
488 
489  // Allocate host memory for scalar variables
490  allocate_device<int>(&d_killme, 1);
491  allocate_device<char>(&d_barrierFlag, 1);
492  allocate_device<unsigned int>(&d_tbcatomic, 5);
493  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
494  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
495  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
496  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
497  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
498  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
499  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
500  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
501  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
502  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
503  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
504  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
505  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
506  // for lone pairs
507  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
508  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
509  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
510  //space for globalmaster forces on device
511  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
512  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
513  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
514 
515  allocate_device<SettleParameters>(&sp, 1);
516 
517  //allocates host scalars in host-mapped, pinned memory
518  allocate_host<int>(&killme, 1);
519  allocate_host<BigReal>(&kineticEnergy, 1);
520  allocate_host<BigReal>(&intKineticEnergy, 1);
521  allocate_host<BigReal>(&kineticEnergy_half, 1);
522  allocate_host<BigReal>(&intKineticEnergy_half, 1);
523  allocate_host<BigReal>(&momentum_x, 1);
524  allocate_host<BigReal>(&momentum_y, 1);
525  allocate_host<BigReal>(&momentum_z, 1);
526  allocate_host<BigReal>(&angularMomentum_x, 1);
527  allocate_host<BigReal>(&angularMomentum_y, 1);
528  allocate_host<BigReal>(&angularMomentum_z, 1);
529  allocate_host<int>(&consFailure, 1);
530  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
531  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
532  allocate_host<unsigned int>(&h_marginViolations, 1);
533  allocate_host<unsigned int>(&h_periodicCellSmall, 1);
534 
535  // allocates host cudaTensors in host-mapped, pinned memory
536  allocate_host<cudaTensor>(&virial, 1);
537  allocate_host<cudaTensor>(&virial_half, 1);
538  allocate_host<cudaTensor>(&intVirialNormal, 1);
539  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
540  allocate_host<cudaTensor>(&intVirialNbond, 1);
541  allocate_host<cudaTensor>(&intVirialSlow, 1);
542  allocate_host<cudaTensor>(&rigidVirial, 1);
543  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
544  allocate_host<cudaTensor>(&lpVirialNormal, 1);
545  allocate_host<cudaTensor>(&lpVirialNbond, 1);
546  allocate_host<cudaTensor>(&lpVirialSlow, 1);
547 
548  // These arrays will get allocated when total forces of the GPU global calculations are requested
549  d_f_saved_nbond_x = nullptr;
550  d_f_saved_nbond_y = nullptr;
551  d_f_saved_nbond_z = nullptr;
552  d_f_saved_slow_x = nullptr;
553  d_f_saved_slow_y = nullptr;
554  d_f_saved_slow_z = nullptr;
555 
556  //Sets values for scalars
557  *kineticEnergy = 0.0;
558  *intKineticEnergy = 0.0;
559  *kineticEnergy_half = 0.0;
560  *intKineticEnergy_half = 0.0;
561  *momentum_x = 0.0;
562  *momentum_y = 0.0;
563  *momentum_z = 0.0;
564  *angularMomentum_x = 0.0;
565  *angularMomentum_y = 0.0;
566  *angularMomentum_z = 0.0;
567  *consFailure = 0;
568  *killme = 0;
569 
570  // JM: Basic infrastructure to time kernels
571  // XXX TODO: Add timing functions to be switched on/off for each of these
572  t_total = 0;
573  t_vverlet = 0;
574  t_pairlistCheck = 0;
575  t_setComputePositions = 0;
576  t_accumulateForceKick = 0;
577  t_rattle = 0;
578  t_submitHalf = 0;
579  t_submitReductions1 = 0;
580  t_submitReductions2 = 0;
581 
582  cudaEventCreate(&eventStart);
583  cudaEventCreate(&eventStop);
584  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
585  cudaCheck(cudaMemset(d_kineticEnergy, 0, sizeof(BigReal)));
586  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
587  cudaCheck(cudaMemset(d_momentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
588  cudaCheck(cudaMemset(d_momentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
589  cudaCheck(cudaMemset(d_momentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
590  cudaCheck(cudaMemset(d_angularMomentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
591  cudaCheck(cudaMemset(d_angularMomentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
592  cudaCheck(cudaMemset(d_angularMomentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
593  cudaCheck(cudaMemset(d_intKineticEnergy, 0, ATOMIC_BINS * sizeof(BigReal)));
594  cudaCheck(cudaMemset(d_virial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
595  cudaCheck(cudaMemset(d_rigidVirial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
596  cudaCheck(cudaMemset(d_intVirialNormal, 0, ATOMIC_BINS * sizeof(cudaTensor)));
597  cudaCheck(cudaMemset(d_intVirialNbond, 0, ATOMIC_BINS * sizeof(cudaTensor)));
598  cudaCheck(cudaMemset(d_intVirialSlow, 0, ATOMIC_BINS * sizeof(cudaTensor)));
599  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
600  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
601  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
602  cudaCheck(cudaMemset(d_extVirial, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(cudaTensor)));
603  cudaCheck(cudaMemset(d_extForce, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double3)));
604  cudaCheck(cudaMemset(d_extEnergy, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double)));
605 
606  memset(h_marginViolations, 0, sizeof(unsigned int));
607  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
608  memset(virial, 0, sizeof(cudaTensor));
609  memset(rigidVirial, 0, sizeof(cudaTensor));
610  memset(intVirialNormal, 0, sizeof(cudaTensor));
611  memset(intVirialNbond, 0, sizeof(cudaTensor));
612  memset(intVirialSlow, 0, sizeof(cudaTensor));
613  memset(lpVirialNormal, 0, sizeof(cudaTensor));
614  memset(lpVirialNbond, 0, sizeof(cudaTensor));
615  memset(lpVirialSlow, 0, sizeof(cudaTensor));
616  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
617 
618  settleList = NULL;
619  settleListSize = 0;
620  rattleList = NULL;
621  rattleListSize = 0;
622  d_consFailure = NULL;
623  d_consFailureSize = 0;
624 
626 
627  // JM: I can bundle the CompAtom and CudaAtomList pointers from the
628  // patches here and fill the Host-arrays after integration. :]
629  // if single-gpu simulation, numPatchesHome is the same as numPatchesGlobal
630  numPatchesHome = numPatchesGlobal;
631 
632 
633  int count = 0;
634  // Single-GPU case
635  if(!mGpuOn){
636  // JM NOTE: This works for a single device only, but it's fast if we want to have multiple PEs sharing it
637  if(deviceCUDA->getMasterPe() == CkMyPe()){
638  for(int i = 0; i < numPes; i++) {
639  atomMapList[i] = AtomMap::ObjectOnPe(i);
640  PatchMap* patchMap = PatchMap::ObjectOnPe(i);
641  int npatch = patchMap->numPatches();
642  for (int j = 0; j < npatch; j++) {
643  HomePatch *patch = patchMap->homePatch(j);
644  // JM NOTE: This data structure can be preserved through migration steps
645  if(patch != NULL) {
646  patchList.push_back(patch);
647  patchNewTolerance[count++] =
648  0.5 * ( simParams->pairlistDist - simParams->cutoff );
649  numAtomsHomeAndProxy += patchMap->patch(j)->getNumAtoms();
650 
651  patchData->devData[deviceIndex].patches.push_back(patch);
652  patchListHomeAndProxy.push_back(patch);
653  }
654  }
655  }
656  }
657  }else{
658  // Multi-device case
659  /* The logic here is a bit trickier than the one on the single-GPU case.
660  /* The logic here is a bit trickier than in the single-GPU case.
661  Each GPU will only allocate space for its home patches and any required proxy patches.
662  This eliminates the possibility of using NCCL-based all-reduce for communication,
663  */
664  if(deviceCUDA->getMasterPe() == CkMyPe()){
665  // Haochuan: For multi-GPU cases, we need to find the atom maps from
666  // all "non-master PEs" that use the same device as the master PE.
667  // 1. Get the current device ID.
668  const int currentDevice = deviceCUDA->getDeviceIDforPe(CkMyPe());
669  // 2. Iterate over all PEs and find those that have the same device ID as this master PE
670  for (int i = 0; i < numPes; ++i) {
671  if (deviceCUDA->getDeviceIDforPe(i) == currentDevice) {
672  atomMapList.push_back(AtomMap::ObjectOnPe(i));
673  }
674  }
675  for(int i = 0; i < deviceCUDA->getNumPesSharingDevice(); i++){
676  PatchMap* pm = PatchMap::ObjectOnPe(deviceCUDA->getPesSharingDevice(i));
677  for (int j = 0; j < pm->numPatches(); j++) {
678  // Aggregates the patches in a separate data structure for now
679  HomePatch *patch = pm->homePatch(j);
680  if(patch != NULL) {
681  patchData->devData[deviceIndex].patches.push_back(patch);
682  }
683  }
684  }
685 
686  // if MGPU is On, we also try to set up peer access
688 #ifdef NAMD_NCCL_ALLREDUCE
689  deviceCUDA->setNcclUniqueId(patchData->ncclId);
690  deviceCUDA->setupNcclComm();
691 #endif
692 
693  }
694  }
695  PatchMap *pm = PatchMap::Object();
696  isPeriodic = (pm->periodic_a() && pm->periodic_b() && pm->periodic_c());
697 
698  // XXX TODO decide how the biasing methods will work in the future -
699  // for multiple devices this will be a problem, how to solve it?
700  if (simParams->constraintsOn) {
701  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
702  stream, mGpuOn);
703  }
704  if (simParams->SMDOn) {
705  if(deviceCUDA->getMasterPe() == CkMyPe()){
706  SMDKernel = new ComputeSMDCUDA(patchList, simParams->SMDk, simParams->SMDk2,
707  simParams->SMDVel, make_double3(simParams->SMDDir.x, simParams->SMDDir.y, simParams->SMDDir.z),
708  simParams->SMDOutputFreq, simParams->firstTimestep, simParams->SMDFile,
710  Node::Object()->molecule->numAtoms, nDevices, deviceIndex, mGpuOn);
711  // need to set smd atom lists in the mgpu case
712  if(mGpuOn)
713  {
714  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
716  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);
717  }
718  }
719  }
720  if (simParams->groupRestraintsOn) {
721  if(deviceCUDA->getMasterPe() == CkMyPe()){
722  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
723  simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
724  if(mGpuOn)
725  {
726  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
727  groupRestraintsKernel->initPeerCOM(stream);
728  }
729  }
730  }
731  if (simParams->mgridforceOn || simParams->gridforceOn ){
732  if(deviceCUDA->getMasterPe() == CkMyPe()){
733  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
734  }
735  }
736  if (Node::Object()->molecule->is_lonepairs_psf) {
737  lonepairsKernel = new ComputeLonepairsCUDA();
738  }
739  if (simParams->consForceOn){
740  if(deviceCUDA->getMasterPe() == CkMyPe()){
741  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,mGpuOn);
742  }
743  }
744 
745 }
746 
747 /* events like ScriptTcl changes to global data require us to rebuild
748  some of the data structures used by the kernels, e.g., the
749  gridForceKernel needs to be rebuilt when the grid, or grid-related
750  data, is changed.
751 
752  Technically these may all be in use and updated at different
753  intervals, but that is a weird edge case that doesn't justify
754  adding a whole other sync path. If that happens, we'll be doing
755  this slightly too often. Such a hypothetical user has already
756  bought in to periodically sacrificing performance to radically
757  change the system being simulated at run time in several
758  different ways on different triggers, so as long as this overhead
759  is substantially better than a full restart, it probably doesn't
760  matter.
761  */
762 
763 void SequencerCUDA::updateDeviceKernels()
764 {
765  if (simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn){
766  // clean out the old and bring in the new if we're a master PE
767  if(deviceCUDA->getMasterPe() == CkMyPe()){
768  if(patchData->updateCounter.fetch_sub(1)>=1)
769  {
770  if(gridForceKernel!=NULL)
771  {
772  delete gridForceKernel;
773  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
774  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
775  }
776  if(consForceKernel!=NULL)
777  {
778  delete consForceKernel;
779  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,
780  mGpuOn);
781  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
782  }
783  }
784  }
785  }
786 }
787 
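// Grows the per-atom host and device buffers when the new atom counts exceed the
// current allocations; buffers are over-allocated by OVERALLOC (1.5x) to limit how
// often this happens. Returns true only if a reallocation actually occurred.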
788 bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
789 {
790  cudaCheck(cudaSetDevice(deviceID));
791  const float OVERALLOC = 1.5f;
792 
793  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated ) {
794  return false;
795  }
796 
797  // Before deallocating the f_saved_nbond_* and f_saved_slow_* buffers,
798  // record whether they were previously allocated so they can be reallocated below
799  bool realloc_gpu_saved_force = false;
800  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
801  realloc_gpu_saved_force = true;
802  }
803 
804 
805  deallocateArrays();
806 
807  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
808  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
809 
810  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
811  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
812  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
813  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
814  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
815  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
816  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
817  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
818  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
819  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
820  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
821  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
822  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
823  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
824  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
825  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
826  }
827  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
828  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
829  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);
830 
831  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
832  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
833  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
834  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
835  allocate_host<char3>(&transform, numAtomsHomeAllocated);
836  allocate_host<float>(&mass, numAtomsHomeAllocated);
837  if (simParams->alchOn) {
838  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
839  }
840  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
841  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
842  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
843 
844  // array buffers for pseudorandom normal distribution must be even length
845  // choose n to be the smallest even number >= numIntegrationAtoms
846  // guarantees that n is always > numIntegrationAtoms
847  int n = (numAtomsHomeAllocated + 1) & (~1);
848  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
849  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
850  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
851  if (simParams->fixedAtomsOn) {
852  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
853  allocate_host(&fixedPosition_x, numAtomsHomeAllocated);
854  allocate_host(&fixedPosition_y, numAtomsHomeAllocated);
855  allocate_host(&fixedPosition_z, numAtomsHomeAllocated);
856  }
857  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
858 
859 
860  if (simParams->useDeviceMigration) {
861  allocate_host<int>(&idMig, numAtomsHomeAllocated);
862  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
863  }
864 
865  allocate_device<double>(&d_f_raw, 9 * numAtomsHomeAndProxyAllocated); // Total number of force buffers
866  d_f_normal_x = &d_f_raw[numAtomsHomeAndProxyAllocated*0];
867  d_f_normal_y = &d_f_raw[numAtomsHomeAndProxyAllocated*1];
868  d_f_normal_z = &d_f_raw[numAtomsHomeAndProxyAllocated*2];
869  d_f_nbond_x = &d_f_raw[numAtomsHomeAndProxyAllocated*3];
870  d_f_nbond_y = &d_f_raw[numAtomsHomeAndProxyAllocated*4];
871  d_f_nbond_z = &d_f_raw[numAtomsHomeAndProxyAllocated*5];
872  d_f_slow_x = &d_f_raw[numAtomsHomeAndProxyAllocated*6];
873  d_f_slow_y = &d_f_raw[numAtomsHomeAndProxyAllocated*7];
874  d_f_slow_z = &d_f_raw[numAtomsHomeAndProxyAllocated*8];
875  allocate_device<double>(&d_pos_raw, 3 * numAtomsHomeAndProxyAllocated);
876  d_pos_x = &d_pos_raw[numAtomsHomeAndProxyAllocated*0];
877  d_pos_y = &d_pos_raw[numAtomsHomeAndProxyAllocated*1];
878  d_pos_z = &d_pos_raw[numAtomsHomeAndProxyAllocated*2];
879  allocate_device<float>(&d_charge, numAtomsHomeAndProxyAllocated);
880  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
881  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
882  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
883  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);
884  }
885  if (realloc_gpu_saved_force) {
886  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
887  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
888  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
889  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
890  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
891  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
892  }
893  allocate_device<int>(&d_sortOrder, numAtomsHomeAndProxyAllocated);
894  allocate_device<int>(&d_unsortOrder, numAtomsHomeAndProxyAllocated);
895 
896  // allocate memory for backup forces in MC barostat
897  if (simParams->monteCarloPressureOn) {
898  // Total number of backup force and positions buffers
899  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
900  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
901  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
902  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
903  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
904  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
905  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
906  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
907  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
908  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
909  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
910  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
911  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
912  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];
913 
914  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
915  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
916  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
917  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
918  // we can use molecule_size + 1, rather than atom size
919  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
920  }
921 
922  if (simParams->alchOn) {
923  allocate_device<int>(&d_partition, numAtomsHomeAndProxyAllocated);
924  }
925 
926  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
927  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
928  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
929  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
930  allocate_device<double>(&d_vel_x, numAtomsHomeAllocated);
931  allocate_device<double>(&d_vel_y, numAtomsHomeAllocated);
932  allocate_device<double>(&d_vel_z, numAtomsHomeAllocated);
933  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
934  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
935  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
936  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
937  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
938  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
939  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
940  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
941  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
942  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
943  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
944  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
945  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
946  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
947 
948  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
949  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
950  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
951  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
952  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
953  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
954  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
955  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
956  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
957  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
958  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
959  if (simParams->fixedAtomsOn) {
960  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
961  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
962  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
963  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
964  }
965 
966  if (simParams->useDeviceMigration) {
967  allocate_device<int>(&d_idMig, numAtomsHomeAndProxyAllocated);
968  allocate_device<int>(&d_vdwType, numAtomsHomeAndProxyAllocated);
969  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
970  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
971  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
972  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
973  }
974 
975  // These arrays will get allocated when total forces of the GPU global calculations are requested
976  d_f_saved_nbond_x = nullptr;
977  d_f_saved_nbond_y = nullptr;
978  d_f_saved_nbond_z = nullptr;
979  d_f_saved_slow_x = nullptr;
980  d_f_saved_slow_y = nullptr;
981  d_f_saved_slow_z = nullptr;
982 
983  // Zero the position buffers (host and device) so they can be accumulated (reduced) into afterwards
984  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
985  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
986  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
987  cudaCheck(cudaMemset(d_pos_x, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
988  cudaCheck(cudaMemset(d_pos_y, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
989  cudaCheck(cudaMemset(d_pos_z, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
990  cudaCheck(cudaMemset(d_vel_x, 0 , sizeof(double)*numAtomsHomeAllocated));
991  cudaCheck(cudaMemset(d_vel_y, 0 , sizeof(double)*numAtomsHomeAllocated));
992  cudaCheck(cudaMemset(d_vel_z, 0 , sizeof(double)*numAtomsHomeAllocated));
993 
994  cudaCheck(cudaMemset(d_posNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
995  cudaCheck(cudaMemset(d_posNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
996  cudaCheck(cudaMemset(d_posNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
997  cudaCheck(cudaMemset(d_velNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
998  cudaCheck(cudaMemset(d_velNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
999  cudaCheck(cudaMemset(d_velNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
1000 
1001  return true;
1002 }
1003 
1004 void SequencerCUDA::reallocateMigrationDestination() {
1005  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
1006  allocate_device<int4>(&d_migrationDestination, numAtomsHomeAndProxyAllocated);
1007 }
1008 
1009 void SequencerCUDA::deallocateArrays() {
1010  if (numAtomsHomeAndProxyAllocated != 0) {
1011  cudaCheck(cudaSetDevice(deviceID));
1012 
1013  deallocate_host<double>(&f_normal_x);
1014  deallocate_host<double>(&f_normal_y);
1015  deallocate_host<double>(&f_normal_z);
1016  if (simParams->colvarsOn || simParams->tclForcesOn || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1017  deallocate_host<double>(&f_global_x);
1018  deallocate_host<double>(&f_global_y);
1019  deallocate_host<double>(&f_global_z);
1020  }
1021  deallocate_host<double>(&f_nbond_x);
1022  deallocate_host<double>(&f_nbond_y);
1023  deallocate_host<double>(&f_nbond_z);
1024  deallocate_host<double>(&f_slow_x);
1025  deallocate_host<double>(&f_slow_y);
1026  deallocate_host<double>(&f_slow_z);
1027  deallocate_host<double>(&pos_x);
1028  deallocate_host<double>(&pos_y);
1029  deallocate_host<double>(&pos_z);
1030  deallocate_host<float>(&charge);
1031  deallocate_host<int>(&sortOrder);
1032  deallocate_host<int>(&unsortOrder);
1033  deallocate_host<double>(&recipMass);
1034  deallocate_host<double>(&vel_x);
1035  deallocate_host<double>(&vel_y);
1036  deallocate_host<double>(&vel_z);
1037  deallocate_host<char3>(&transform);
1038  deallocate_host<float>(&mass);
1039  if (simParams->alchOn) {
1040  deallocate_host<int>(&partition);
1041  }
1042  deallocate_host<float>(&langevinParam);
1043  deallocate_host<float>(&langScalVelBBK2);
1044  deallocate_host<float>(&langScalRandBBK2);
1045 
1046  deallocate_host<int>(&hydrogenGroupSize);
1047  deallocate_host<int>(&atomFixed);
1048  if (simParams->fixedAtomsOn) {
1049  deallocate_host<int>(&groupFixed);
1050  deallocate_host<double>(&fixedPosition_x);
1051  deallocate_host<double>(&fixedPosition_y);
1052  deallocate_host<double>(&fixedPosition_z);
1053  }
1054 
1055  deallocate_host<float>(&rigidBondLength);
1056 
1057  deallocate_device<double>(&d_f_raw);
1058  deallocate_device<double>(&d_pos_raw);
1059  deallocate_device<float>(&d_charge);
1060  deallocate_device<int>(&d_sortOrder);
1061  deallocate_device<int>(&d_unsortOrder);
1062  if (simParams->alchOn) {
1063  deallocate_device<int>(&d_partition);
1064  }
1065 
1066  deallocate_device<double>(&d_posNew_raw);
1067 
1068  if (simParams->monteCarloPressureOn) {
1069  deallocate_device<double>(&d_f_rawMC);
1070  deallocate_device<double>(&d_pos_rawMC);
1071 
1072  deallocate_host<int>(&id);
1073  deallocate_device<int>(&d_id);
1074  deallocate_device<int>(&d_idOrder);
1075  deallocate_device<int>(&d_moleculeAtom);
1076  deallocate_device<int>(&d_moleculeStartIndex);
1077  }
1078 
1079  if (simParams->useDeviceMigration) {
1080  deallocate_host<int>(&idMig);
1081  deallocate_host<int>(&vdwType);
1082  deallocate_device<int>(&d_idMig);
1083  deallocate_device<int>(&d_vdwType);
1084  deallocate_device<FullAtom>(&d_atomdata_AoS);
1085  deallocate_device<int>(&d_migrationGroupSize);
1086  deallocate_device<int>(&d_migrationGroupIndex);
1087  deallocate_device<int>(&d_sortIndex);
1088  }
1089  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal || (simParams->IMDon && ! (simParams->IMDignore || simParams->IMDignoreForces))){
1090  deallocate_device<double>(&d_f_global_x);
1091  deallocate_device<double>(&d_f_global_y);
1092  deallocate_device<double>(&d_f_global_z);
1093  }
1094  deallocate_device<double>(&d_vel_x);
1095  deallocate_device<double>(&d_vel_y);
1096  deallocate_device<double>(&d_vel_z);
1097  deallocate_device<double>(&d_recipMass);
1098  deallocate_device<char3>(&d_transform);
1099  deallocate_device<double>(&d_velNew_x);
1100  deallocate_device<double>(&d_velNew_y);
1101  deallocate_device<double>(&d_velNew_z);
1102  deallocate_device<double>(&d_posSave_x);
1103  deallocate_device<double>(&d_posSave_y);
1104  deallocate_device<double>(&d_posSave_z);
1105  deallocate_device<double>(&d_rcm_x);
1106  deallocate_device<double>(&d_rcm_y);
1107  deallocate_device<double>(&d_rcm_z);
1108  deallocate_device<double>(&d_vcm_x);
1109  deallocate_device<double>(&d_vcm_y);
1110  deallocate_device<double>(&d_vcm_z);
1111  deallocate_device<float>(&d_mass);
1112  deallocate_device<float>(&d_langevinParam);
1113  deallocate_device<float>(&d_langScalVelBBK2);
1114  deallocate_device<float>(&d_langScalRandBBK2);
1115  deallocate_device<float>(&d_gaussrand_x);
1116  deallocate_device<float>(&d_gaussrand_y);
1117  deallocate_device<float>(&d_gaussrand_z);
1118  deallocate_device<int>(&d_hydrogenGroupSize);
1119  deallocate_device<float>(&d_rigidBondLength);
1120  deallocate_device<int>(&d_atomFixed);
1121  if (simParams->fixedAtomsOn) {
1122  deallocate_device<int>(&d_groupFixed);
1123  deallocate_device<double>(&d_fixedPosition_x);
1124  deallocate_device<double>(&d_fixedPosition_y);
1125  deallocate_device<double>(&d_fixedPosition_z);
1126  }
1127  deallocate_device<double>(&d_f_saved_nbond_x);
1128  deallocate_device<double>(&d_f_saved_nbond_y);
1129  deallocate_device<double>(&d_f_saved_nbond_z);
1130  deallocate_device<double>(&d_f_saved_slow_x);
1131  deallocate_device<double>(&d_f_saved_slow_y);
1132  deallocate_device<double>(&d_f_saved_slow_z);
1133  }
1134 }
1135 
1136 void SequencerCUDA::deallocateStaticArrays() {
1137  cudaCheck(cudaSetDevice(deviceID));
1138 
1139  deallocate_host<cudaTensor>(&extVirial);
1140  deallocate_host<double3>(&extForce);
1141  deallocate_host<double>(&extEnergy);
1142  deallocate_host<unsigned int>(&h_marginViolations);
1143  deallocate_host<unsigned int>(&h_periodicCellSmall);
1144 
1145  // XXX TODO: Deallocate the additional arrays we added for shmem version
1146  deallocate_host<double3>(&awayDists);
1147  deallocate_host<double3>(&patchMin);
1148  deallocate_host<double3>(&patchMax);
1149  deallocate_host<CudaAtom*>(&cudaAtomLists);
1150  deallocate_host<double3>(&patchCenter);
1151  deallocate_host<int>(&globalToLocalID);
1152  deallocate_host<int>(&patchToDeviceMap);
1153  deallocate_host<Lattice>(&pairlist_lattices);
1154  deallocate_host<double>(&patchMaxAtomMovement);
1155  deallocate_host<double>(&patchNewTolerance);
1156  deallocate_host<CudaMInfo>(&mInfo);
1157  deallocate_host<bool*>(&h_patchRecordHasForces);
1158 
1159  deallocate_host<cudaTensor>(&lpVirialNormal);
1160  deallocate_host<cudaTensor>(&lpVirialNbond);
1161  deallocate_host<cudaTensor>(&lpVirialSlow);
1162 
1163  deallocate_device<double3>(&d_awayDists);
1164  deallocate_device<double3>(&d_patchMin);
1165  deallocate_device<double3>(&d_patchMax);
1166  deallocate_device<int>(&d_globalToLocalID);
1167  deallocate_device<int>(&d_patchToDeviceMap);
1168  deallocate_device<int>(&d_sortOrder);
1169  deallocate_device<int>(&d_unsortOrder);
1170  deallocate_device<double3>(&d_patchCenter);
1171  deallocate_device<Lattice>(&d_lattices);
1172  deallocate_device<Lattice>(&d_pairlist_lattices);
1173  deallocate_device<double>(&d_patchMaxAtomMovement);
1174  deallocate_device<double>(&d_patchNewTolerance);
1175  deallocate_device<CudaMInfo>(&d_mInfo);
1176 
1177  deallocate_device<int>(&d_killme);
1178  deallocate_device<char>(&d_barrierFlag);
1179  deallocate_device<unsigned int>(&d_tbcatomic);
1180  deallocate_device<BigReal>(&d_kineticEnergy);
1181  deallocate_device<BigReal>(&d_intKineticEnergy);
1182  deallocate_device<BigReal>(&d_momentum_x);
1183  deallocate_device<BigReal>(&d_momentum_y);
1184  deallocate_device<BigReal>(&d_momentum_z);
1185  deallocate_device<BigReal>(&d_angularMomentum_x);
1186  deallocate_device<BigReal>(&d_angularMomentum_y);
1187  deallocate_device<BigReal>(&d_angularMomentum_z);
1188  deallocate_device<cudaTensor>(&d_virial);
1189  deallocate_device<cudaTensor>(&d_intVirialNormal);
1190  deallocate_device<cudaTensor>(&d_intVirialNbond);
1191  deallocate_device<cudaTensor>(&d_intVirialSlow);
1192  deallocate_device<cudaTensor>(&d_lpVirialNormal);
1193  deallocate_device<cudaTensor>(&d_lpVirialNbond);
1194  deallocate_device<cudaTensor>(&d_lpVirialSlow);
1195  deallocate_device<cudaTensor>(&d_rigidVirial);
1196  deallocate_device<cudaTensor>(&d_extVirial);
1197  deallocate_device<double3>(&d_extForce);
1198  deallocate_device<double>(&d_extEnergy);
1199  deallocate_device<SettleParameters>(&sp);
1200  deallocate_device<unsigned int>(&deviceQueue);
1201  deallocate_device<double*>(&d_peer_pos_x);
1202  deallocate_device<double*>(&d_peer_pos_y);
1203  deallocate_device<double*>(&d_peer_pos_z);
1204  deallocate_device<float*>(&d_peer_charge);
1205  deallocate_device<int*>(&d_peer_partition);
1206  deallocate_device<double*>(&d_peer_vel_x);
1207  deallocate_device<double*>(&d_peer_vel_y);
1208  deallocate_device<double*>(&d_peer_vel_z);
1209  deallocate_device<double*>(&d_peer_fb_x);
1210  deallocate_device<double*>(&d_peer_fb_y);
1211  deallocate_device<double*>(&d_peer_fb_z);
1212  deallocate_device<double*>(&d_peer_fn_x);
1213  deallocate_device<double*>(&d_peer_fn_y);
1214  deallocate_device<double*>(&d_peer_fn_z);
1215  deallocate_device<double*>(&d_peer_fs_x);
1216  deallocate_device<double*>(&d_peer_fs_y);
1217  deallocate_device<double*>(&d_peer_fs_z);
1218 
1219  deallocate_device<bool*>(&d_patchRecordHasForces);
1220 
1221  deallocate_device(&d_fixVirialNormal);
1222  deallocate_device(&d_fixVirialNbond);
1223  deallocate_device(&d_fixVirialSlow);
1224  deallocate_device(&d_fixForceNormal);
1225  deallocate_device(&d_fixForceNbond);
1226  deallocate_device(&d_fixForceSlow);
1227 
1228  if (simParams->useDeviceMigration) {
1229  deallocate_device<FullAtom>(&d_atomdata_AoS_in);
1230  deallocate_device<int>(&d_sortSoluteIndex);
1231  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
1232  }
1233 
1234  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
1235 }
1236 
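// Copies the 3x3x3 neighbor-destination patch IDs from the HomePatch's
// MigrationInfo into the flat CudaMInfo entry used on the device; absent
// neighbors are flagged with -1.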
1237 void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex){
1238  CudaMInfo &m = mInfo[patchIndex];
1239  if (!p->patchMapRead) p->readPatchMap();
1240  for(int x = 0; x < 3; x++){
1241  for(int y = 0; y < 3; y++){
1242  for(int z = 0; z < 3; z++){
1243  // copies migration info over
1244  MigrationInfo *hm = p->mInfo[x][y][z];
1245  if(hm != NULL) m.destPatchID[x][y][z] = hm->destPatchID;
1246  else m.destPatchID[x][y][z] = -1; // let's flag this as -1 for now
1247  }
1248  }
1249  }
1250  if (simParams->useDeviceMigration) {
1251  m.destPatchID[1][1][1] = p->getPatchID();
1252  }
1253 }
1254 
1255 void SequencerCUDA::assembleOrderedPatchList(){
1256  // Assembles the patches on each Pe sharing a device into a device-ordered list
1257  patchList.clear();
1258 
1259  // Handle our own patches
1260  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
1261  HomePatch *p = patchData->devData[deviceIndex].patches[i];
1262  patchList.push_back(p);
1263  }
1264 
1265 
1266  // Do we really need this?
1267 #if 1
1268  patchListHomeAndProxy.clear();
1269  // Set up list of device patches. We need the home patches for everyone
1270  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1271  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1272  const int patchID = localPatches[i].patchID;
1273 
1274 
1275  for(int d = 0; d < CkNumPes(); d++){
1276  PatchMap* pm = PatchMap::ObjectOnPe(d);
1277  HomePatch *patch = pm->homePatch(patchID);
1278  if(patch != NULL) {
1279  patchListHomeAndProxy.push_back(patch);
1280  }
1281  }
1282  }
1283 
1284 #endif
1285 }
1286 
1294 void SequencerCUDA::copyAoSDataToHost() {
1295  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1296  patchData = cpdata.ckLocalBranch();
1297 
1298  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1299  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1300 
1301  CUDAMigrationKernel->update_AoS(
1302  numPatchesHome,
1303  patchData->devData[deviceIndex].d_localPatches,
1304  (FullAtom*) d_atomdata_AoS,
1305  d_vel_x, d_vel_y, d_vel_z,
1306  d_pos_x, d_pos_y, d_pos_z,
1307  stream
1308  );
1309 
1310  for (int i = 0; i < integrationPatches.size(); i++) {
1311  const int numAtoms = localPatches[i].numAtoms;
1312  const int offset = localPatches[i].bufferOffset;
1313  HomePatch *patch = integrationPatches[i];
1314  patch->updateAtomCount(numAtoms, false);
1315  patch->updateAtomBuffers();
1316  FullAtomList& h_atomdata = patch->getAtomList();
1317  copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
1318  }
1319  cudaCheck(cudaStreamSynchronize(stream));
1320 }
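The loop above copies each patch's slice of the flat AoS device buffer back to its host atom list, using the record's bufferOffset and numAtoms to locate the slice. A minimal host-only sketch of that slicing, with Atom and Record as simplified stand-ins for the NAMD structures:

// Sketch only: slicing a flat atom buffer by per-patch records.
// Atom and Record are simplified stand-ins, not the NAMD types.
#include <cstdio>
#include <vector>

struct Atom { double x, y, z; };
struct Record { int bufferOffset; int numAtoms; };

static void copySlices(const std::vector<Atom>& flat,
                       const std::vector<Record>& records) {
  for (const Record& r : records) {
    // The patch's atoms occupy [bufferOffset, bufferOffset + numAtoms).
    const Atom* begin = flat.data() + r.bufferOffset;
    std::vector<Atom> patchCopy(begin, begin + r.numAtoms);
    std::printf("copied %zu atoms starting at offset %d\n",
                patchCopy.size(), r.bufferOffset);
  }
}

int main() {
  std::vector<Atom> flat(10);
  std::vector<Record> records = { {0, 4}, {4, 6} };
  copySlices(flat, records);
  return 0;
}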
1321 
1322 
1329 void SequencerCUDA::migrationLocalInit() {
1330  CUDAMigrationKernel->computeMigrationGroupIndex(
1331  numPatchesHome,
1332  patchData->devData[deviceIndex].d_localPatches,
1333  d_migrationGroupSize,
1334  d_migrationGroupIndex,
1335  stream
1336  );
1337 
1338  CUDAMigrationKernel->update_AoS(
1339  numPatchesHome,
1340  patchData->devData[deviceIndex].d_localPatches,
1341  (FullAtom*) d_atomdata_AoS,
1342  d_vel_x, d_vel_y, d_vel_z,
1343  d_pos_x, d_pos_y, d_pos_z,
1344  stream
1345  );
1346 
1347  CUDAMigrationKernel->computeMigrationDestination(
1348  numPatchesHome,
1349  patchData->devData[deviceIndex].d_localPatches,
1350  myLattice,
1351  d_mInfo,
1352  d_patchToDeviceMap,
1353  d_globalToLocalID,
1354  d_patchMin,
1355  d_patchMax,
1356  d_hydrogenGroupSize,
1357  d_migrationGroupSize,
1358  d_migrationGroupIndex,
1359  d_pos_x, d_pos_y, d_pos_z,
1360  d_migrationDestination,
1361  stream
1362  );
1363 
1364  CUDAMigrationKernel->performLocalMigration(
1365  numPatchesHome,
1366  patchData->devData[deviceIndex].d_localPatches,
1367  (FullAtom*) d_atomdata_AoS,
1368  (FullAtom*) d_atomdata_AoS_in,
1369  d_migrationDestination,
1370  stream
1371  );
1372 
1373  cudaCheck(cudaStreamSynchronize(stream));
1374 }
1375 
1380 void SequencerCUDA::migrationPerform() {
1381  CUDAMigrationKernel->performMigration(
1382  numPatchesHome,
1383  patchData->devData[deviceIndex].d_localPatches,
1384  d_peer_record,
1385  (FullAtom*) d_atomdata_AoS,
1386  d_peer_atomdata,
1387  d_migrationGroupSize,
1388  d_migrationGroupIndex,
1389  d_migrationDestination,
1390  stream
1391  );
1392  cudaCheck(cudaStreamSynchronize(stream));
1393 }
1394 
1395 
1396 void SequencerCUDA::migrationUpdateAtomCounts() {
1397  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1398  patchData = cpdata.ckLocalBranch();
1399 
1400  CUDAMigrationKernel->updateLocalRecords(
1401  numPatchesHome,
1402  patchData->devData[deviceIndex].d_localPatches,
1403  d_peer_record,
1404  patchData->devData[deviceIndex].d_peerPatches,
1405  stream
1406  );
1407 
1408  cudaCheck(cudaStreamSynchronize(stream));
1409 }
1410 
1411 void SequencerCUDA::migrationUpdateAtomOffsets() {
1412  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1413  patchData = cpdata.ckLocalBranch();
1414 
1415  CUDAMigrationKernel->updateLocalRecordsOffset(
1416  numPatchesHomeAndProxy,
1417  patchData->devData[deviceIndex].d_localPatches,
1418  stream
1419  );
1420 
1421  cudaCheck(cudaStreamSynchronize(stream));
1422 }
1423 
1424 void SequencerCUDA::migrationUpdateRemoteOffsets() {
1425  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1426  patchData = cpdata.ckLocalBranch();
1427 
1428  CUDAMigrationKernel->updatePeerRecords(
1429  numPatchesHomeAndProxy,
1430  patchData->devData[deviceIndex].d_localPatches,
1431  d_peer_record,
1432  patchData->devData[deviceIndex].d_peerPatches,
1433  stream
1434  );
1435 
1436  cudaCheck(cudaStreamSynchronize(stream));
1437 }
1438 
1439 void SequencerCUDA::migrationUpdateProxyDestination() {
1440  if (mGpuOn) {
1441  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1442  patchData = cpdata.ckLocalBranch();
1443 
1444  // This is implemented as a put instead of a get to avoid the need
1445  // for a node synchronization.
1446  CUDAMigrationKernel->copyMigrationDestinationToProxies(
1447  deviceIndex,
1448  numPatchesHome,
1449  numPatchesHomeAndProxy,
1450  patchData->devData[deviceIndex].d_localPatches,
1451  patchData->devData[deviceIndex].d_peerPatches,
1452  d_peer_migrationDestination,
1453  stream
1454  );
1455  }
1456 }
1457 
1458 void SequencerCUDA::copyPatchDataToHost() {
1459  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1460  patchData = cpdata.ckLocalBranch();
1461  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1462 
1463  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1464  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1465 
1466  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
1467  cudaCheck(cudaStreamSynchronize(stream));
1468 
1469 
1470  // Update Atom Counts
1471  for (int i = 0; i < numPatchesHome; i++) {
1472  HomePatch* hp = integrationPatches[i];
1473  hp->updateAtomCount(localPatches[i].numAtoms, false);
1474  }
1475  cudaCheck(cudaStreamSynchronize(stream));
1476 }
1477 
1478 // This code path is used only by device migration
1479 void SequencerCUDA::copyAtomDataToDeviceAoS() {
1480  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1481  patchData = cpdata.ckLocalBranch();
1482 
1483  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1484  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1485  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1486 
1487 
1488  for (int i = 0; i < integrationPatches.size(); i++) {
1489  const int numAtoms = localPatches[i].numAtoms;
1490  if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
1491  iout << iERROR << "The number of atoms in patch " << i << " is "
1492  << numAtoms << ", greater than the limit for GPU atom migration ("
1493  << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
1494  NAMD_bug("NAMD has stopped simulating due to the error above, "
1495  "but you could disable GPUAtomMigration and try again.\n");
1496  }
1497  const int offset = localPatches[i].bufferOffset;
1498  HomePatch *patch = integrationPatches[i];
1499  FullAtomList& h_atomdata = patch->getAtomList();
1500  copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), d_atomdata_AoS_in + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
1501  }
1502  cudaCheck(cudaStreamSynchronize(stream));
1503 }
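Under device migration the staging buffer is padded to a fixed per-patch capacity, so patch i always begins at i * kMaxAtomsPerPatch regardless of its current atom count (hence the limit check above). A small sketch of that padded indexing; kMaxAtomsPerPatchSketch and AtomSketch are illustrative stand-ins, not the NAMD definitions:

// Sketch only: padded per-patch indexing into a staging buffer.
#include <cassert>
#include <cstdint>
#include <vector>

constexpr int kMaxAtomsPerPatchSketch = 8; // illustrative capacity, not NAMD's value

struct AtomSketch { double x, y, z; };

// Returns a pointer to the start of patch i's fixed-size slot.
static AtomSketch* patchSlot(std::vector<AtomSketch>& staging, int patchIndex) {
  const int64_t offset = static_cast<int64_t>(patchIndex) * kMaxAtomsPerPatchSketch;
  return staging.data() + offset;
}

int main() {
  const int numPatches = 3;
  std::vector<AtomSketch> staging(numPatches * kMaxAtomsPerPatchSketch);
  // Write 5 atoms into patch 2's slot; the remaining 3 entries stay as padding.
  AtomSketch* slot = patchSlot(staging, 2);
  for (int j = 0; j < 5; j++) slot[j] = AtomSketch{0.0, 0.0, 0.0};
  assert(slot == staging.data() + 2 * kMaxAtomsPerPatchSketch);
  return 0;
}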
1504 
1505 /*
1506  * Aggregates and copies various data from the host to the device.
1507  *
1508  * Some data is copied only for home patches, while other data is copied
1509  * for both home and proxy patches.
1510  *
1511  */
1512 void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
1513 
1514  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
1515  if(copyForces){
1516  switch (maxForceNumber) {
1517  case 2:
1518  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_x, double, stream);
1519  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_y, double, stream);
1520  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_z, double, stream);
1521  case 1:
1522  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_x, double, stream);
1523  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_y, double, stream);
1524  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_z, double, stream);
1525  case 0:
1526  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_x, double, stream);
1527  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_y, double, stream);
1528  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_z, double, stream);
1529  }
1530  }
1531 
1532  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);
1533  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_y, double, stream);
1534  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_z, double, stream);
1535  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
1536  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_y, double, stream);
1537  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_z, double, stream);
1538  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
1539  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream);
1540  if (simParams->alchOn) {
1541  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(partition, int, stream);
1542  }
1543 
1544  if (simParams->langevinOn) {
1545  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
1546  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
1547  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
1548  }
1549 
1550  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
1551  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
1552  if (simParams->fixedAtomsOn) {
1553  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
1554  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
1555  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
1556  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
1557  }
1558  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(sortOrder, int, stream);
1559  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(unsortOrder, int, stream);
1560  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
1561 
1562  if (simParams->monteCarloPressureOn) {
1563  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);
1564  //set up initial mapping for global index to local array index
1565  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
1566  }
1567 }
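The switch on maxForceNumber relies on deliberate case fallthrough: requesting slow forces (2) also copies the nonbonded (1) and normal (0) force arrays. A minimal sketch of that cascade, where copyField() is a hypothetical stand-in for the aggregation macros:

// Sketch only: deliberate switch fallthrough so higher force levels
// also copy every lower level.
#include <cstdio>

static void copyField(const char* name) { std::printf("copy %s\n", name); }

static void copyForces(int maxForceNumber) {
  switch (maxForceNumber) {
    case 2:  // slow (long-range electrostatics)
      copyField("f_slow");
      // fall through
    case 1:  // nonbonded
      copyField("f_nbond");
      // fall through
    case 0:  // normal (bonded and corrections)
      copyField("f_normal");
  }
}

int main() {
  copyForces(2); // copies f_slow, f_nbond, and f_normal
  return 0;
}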
1568 
1569 
1570 void SequencerCUDA::migrationLocalPost(int startup) {
1571  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1572  patchData = cpdata.ckLocalBranch();
1573 
1574  if (simParams->useDeviceMigration) {
1575  if (!startup) {
1576  CUDAMigrationKernel->transformMigratedPositions(
1577  numPatchesHome,
1578  patchData->devData[deviceIndex].d_localPatches,
1579  d_patchCenter,
1580  (FullAtom*) d_atomdata_AoS_in,
1581  myLattice,
1582  stream
1583  );
1584  }
1585 
1586  // sort solvent/solute data
1587  CUDAMigrationKernel->sortSolventAtoms(
1588  numPatchesHome,
1589  patchData->devData[deviceIndex].d_localPatches,
1590  (FullAtom*) d_atomdata_AoS_in,
1591  (FullAtom*) d_atomdata_AoS,
1592  d_sortSoluteIndex,
1593  stream
1594  );
1595 
1596  double dt = 1.0;
1597  double kbT = 1.0;
1598  double tempFactor = 1.0;
1599  if (simParams->langevinOn) {
1600  dt = simParams->dt * 0.001; // convert timestep to ps
1601  kbT = BOLTZMANN * simParams->langevinTemp;
1602  int lesReduceTemp = (simParams->lesOn && simParams->lesReduceTemp);
1603  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);
1604  }
1605  CUDAMigrationKernel->copy_AoS_to_SoA(
1606  numPatchesHome, simParams->alchOn,
1607  simParams->langevinOn, dt, kbT, tempFactor,
1608  patchData->devData[deviceIndex].d_localPatches,
1609  (FullAtom*) d_atomdata_AoS,
1610  d_recipMass,
1611  d_vel_x, d_vel_y, d_vel_z,
1612  d_pos_x, d_pos_y, d_pos_z,
1613  d_mass, d_charge,
1614  d_idMig, d_vdwType,
1615  d_hydrogenGroupSize, d_migrationGroupSize,
1616  d_atomFixed,
1617  d_rigidBondLength,
1618  d_transform,
1619  d_partition,
1620  d_langevinParam,
1621  d_langScalVelBBK2,
1622  d_langScalRandBBK2,
1623  stream
1624  );
1625  }
1626 
1627  // Other migration post processing steps
1628  if (simParams->useDeviceMigration) {
1629  if (simParams->monteCarloPressureOn) {
1630  CUDASequencerKernel->SetAtomIndexOrder(d_idMig, d_idOrder, numAtomsHome, stream);
1631  }
1632  }
1633 
1634  // JM: Saving positions to these buffers for pairlistCheck
1635  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
1636  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
1637  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
1638 
1639  // JM NOTE: We need to save the lattice at the beginning of the cycle
1640  // in order to use it in SequencerCUDAKernel::pairlistCheck();
1641  myLatticeOld = myLattice;
1642 
1643  // JM NOTE: Recalculates the centers of mass since we have new positions
1644  CUDASequencerKernel->centerOfMass(
1645  d_pos_x, d_pos_y, d_pos_z,
1646  d_rcm_x, d_rcm_y, d_rcm_z,
1647  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1648  CUDASequencerKernel->centerOfMass(
1649  d_vel_x, d_vel_y, d_vel_z,
1650  d_vcm_x, d_vcm_y, d_vcm_z,
1651  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1652 
1653  cudaCheck(cudaStreamSynchronize(stream));
1654 }
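The centerOfMass calls above recompute per-hydrogen-group centers of mass and velocity from the migrated coordinates. A host-side sketch of the mass-weighted reduction over one group, assuming the group length is stored at the group's first (parent) atom, as hydrogenGroupSize does:

// Sketch only: mass-weighted center of mass for one hydrogen group.
#include <vector>

struct COM { double x, y, z; };

static COM groupCenterOfMass(const std::vector<double>& x,
                             const std::vector<double>& y,
                             const std::vector<double>& z,
                             const std::vector<double>& mass,
                             int first, int groupSize) {
  double mTot = 0.0;
  COM c{0.0, 0.0, 0.0};
  for (int i = first; i < first + groupSize; i++) {
    c.x += mass[i] * x[i];
    c.y += mass[i] * y[i];
    c.z += mass[i] * z[i];
    mTot += mass[i];
  }
  c.x /= mTot; c.y /= mTot; c.z /= mTot;
  return c;
}

int main() {
  // One water group: O followed by two H atoms.
  std::vector<double> x{0.0, 0.96, -0.24}, y{0.0, 0.0, 0.93}, z{0.0, 0.0, 0.0};
  std::vector<double> mass{15.999, 1.008, 1.008};
  COM c = groupCenterOfMass(x, y, z, mass, 0, 3);
  (void)c;
  return 0;
}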
1655 
1656 void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
1657  if(simParams->eFieldOn || simParams->SMDOn || simParams->groupRestraintsOn ||
1658  simParams->monteCarloPressureOn || simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn ||
1659  simParams->colvarsOn ||
1660  simParams->useCudaGlobal ||
1661  simParams->tclForcesOn){
1662 
1663  // Hand-copies the transform field into a char3 SOA buffer
1664  // JM NOTE: We're copying ints into chars because "transform" is an int
1665  // field in PatchDataSOA but a signed char in FullAtom
1666  size_t offset = 0;
1667  for (int i = 0; i < numPatchesHome; i++) {
1668  PatchDataSOA& current = patchList[i]->patchDataSOA;
1669  const int numPatchAtoms = current.numAtoms;
1670  // memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type));
1671  for(int j = 0; j < numPatchAtoms; j++){
1672  transform[offset + j].x = current.transform_i[j];
1673  transform[offset + j].y = current.transform_j[j];
1674  transform[offset + j].z = current.transform_k[j];
1675  }
1676  offset += numPatchAtoms;
1677  }
1678  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
1679  }
1680 
1681  if (!startup) {
1683  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
1684  }
1685  if(simParams->constraintsOn) {
1686  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1687  }
1688  if(simParams->SMDOn) {
1689  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1690  }
1691  if(simParams->groupRestraintsOn) {
1692  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1693  }
1694  if(simParams->mgridforceOn || simParams->gridforceOn){
1695 
1696  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
1697  }
1698  if(simParams->consForceOn) {
1699  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1700  }
1701 
1702  }
1703 }
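The transform repack above narrows three per-atom int fields into one packed array before a single contiguous host-to-device copy. A compact sketch of the same gather-then-narrow idea; Char3 and PatchSOASketch are simplified stand-ins for the NAMD types:

// Sketch only: narrowing per-atom int transform indices into a packed
// char3-style array prior to one contiguous copy.
#include <vector>

struct Char3 { signed char x, y, z; };

struct PatchSOASketch {   // simplified stand-in for PatchDataSOA
  std::vector<int> transform_i, transform_j, transform_k;
};

static void packTransforms(const std::vector<PatchSOASketch>& patches,
                           std::vector<Char3>& packed) {
  size_t offset = 0;
  for (const PatchSOASketch& p : patches) {
    for (size_t j = 0; j < p.transform_i.size(); j++) {
      packed[offset + j].x = static_cast<signed char>(p.transform_i[j]);
      packed[offset + j].y = static_cast<signed char>(p.transform_j[j]);
      packed[offset + j].z = static_cast<signed char>(p.transform_k[j]);
    }
    offset += p.transform_i.size();
  }
  // A single copy_HtoD-style transfer of `packed` would follow here.
}

int main() {
  std::vector<PatchSOASketch> patches = { { {0, 1}, {0, -1}, {1, 0} } };
  std::vector<Char3> packed(2);
  packTransforms(patches, packed);
  return 0;
}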
1704 
1705 void SequencerCUDA::migrationUpdateDestination() {
1706  CUDAMigrationKernel->updateMigrationDestination(
1707  numAtomsHomePrev,
1708  d_migrationDestination,
1709  d_peer_sortSoluteIndex,
1710  stream
1711  );
1712 }
1713 
1714 bool SequencerCUDA::copyPatchData(
1715  const bool copyIn,
1716  const bool startup
1717 ) {
1718  bool reallocated = false;
1719  if (copyIn) {
1720  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1721  patchData = cpdata.ckLocalBranch();
1722 
1723  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1724 
1725  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1726  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1727 
1728  if (startup) {
1729  // numPatchesHomeAndProxy is set when the patch data is constructed
1730  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1731  numPatchesHome = homePatches.size();
1732  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
1733 
1734  if (simParams->useDeviceMigration) {
1735  // Padded data structures which will not be reallocated
1736  allocate_device<FullAtom>(&d_atomdata_AoS_in, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch);
1737  allocate_device<int>(&d_sortSoluteIndex, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch);
1738  d_migrationDestination = NULL;
1739  }
1740 #if defined(NAMD_HIP)
1741  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
1742  sizeof(CudaLocalRecord)*numPatchesHomeAndProxy,
1743  hipDeviceMallocFinegrained);
1744  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
1745  sizeof(CudaPeerRecord)*peerPatches.size(),
1746  hipDeviceMallocFinegrained);
1747 #else
1748  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
1749  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
1750 #endif
1751  if (simParams->useDeviceMigration) {
1752  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);
1753  }
1754 
1755  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1756  numPatchesHomeAndProxy, stream);
1757  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1758  peerPatches.size(), stream);
1759  if(true || mGpuOn) {
1760  this->assembleOrderedPatchList();
1761  }
1762  this->copySettleParameter();
1763 
1764  for (int i = 0; i < numPatchesHome; i++) {
1765  HomePatch *patch = homePatches[i];
1766  this->copyMigrationInfo(patch, i);
1767  patchNewTolerance[i] = 0.5 * ( simParams->pairlistDist - simParams->cutoff);
1768 
1769  globalToLocalID[patch->getPatchID()] = i;
1770  patchToDeviceMap[patch->getPatchID()] = deviceIndex;
1771  }
1772  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
1773  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
1774 
1775  // The globalToLocalID and patchToDeviceMap data structures need to be system-wide for migration.
1776  // They are also used in tuple migration, so we add them to patchData so they can be
1777  // accessed easily elsewhere.
1778  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
1779  if (i == deviceIndex) continue;
1780  std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
1781  for (int j = 0; j < otherPatches.size(); j++) {
1782  HomePatch *patch = otherPatches[j];
1783  globalToLocalID[patch->getPatchID()] = j;
1784  patchToDeviceMap[patch->getPatchID()] = i;
1785  }
1786  }
1787  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
1788  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
1789  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
1790  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;
1791 
1792  // Allocate more data
1793  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);
1794  }
1795 
1796  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1797  HomePatch *patch = patchListHomeAndProxy[i];
1798  awayDists[i].x = patch->aAwayDist;
1799  awayDists[i].y = patch->bAwayDist;
1800  awayDists[i].z = patch->cAwayDist;
1801  COPY_CUDAVECTOR(patch->center, patchCenter[i]);
1802  COPY_CUDAVECTOR(patch->getMin(), patchMin[i]);
1803  COPY_CUDAVECTOR(patch->getMax(), patchMax[i]);
1804  }
1805 
1806  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
1807  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
1808  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
1809  copy_HtoD<double3>(patchCenter, d_patchCenter, numPatchesHomeAndProxy, stream);
1810 
1811  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
1812  localPatches[numPatchesHomeAndProxy-1].numAtoms;
1813 
1814  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
1815  localPatches[numPatchesHome-1].numAtoms;
1816 
1817  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);
1818 
1819 
1820  numAtomsHomePrev = numAtomsHome;
1821  numAtomsHomeAndProxy = totalAtomCount;
1822  numAtomsHome = homeAtomCount;
1823 
1824  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
1825 
1826  if (!startup) {
1827  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1828  numPatchesHomeAndProxy, stream);
1829  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1830  peerPatches.size(), stream);
1831  }
1832  if (startup) {
1833  if (simParams->monteCarloPressureOn) {
1834  //Only at startup, copy the molecule's index info to GPU
1835  Molecule *molecule = Node::Object()->molecule;
1836  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
1837  copy_HtoD<int>(molecule->moleculeStartIndex, d_moleculeStartIndex, molecule->numMolecules + 1, stream);
1838  }
1839  }
1840  }
1841  return reallocated;
1842 }
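Because the patch records are laid out in buffer order with home patches first, the home and total atom counts above follow directly from the last relevant record's bufferOffset plus its numAtoms. A small sketch of that bookkeeping, with RecordSketch as a simplified stand-in for CudaLocalRecord:

// Sketch only: deriving atom counts from ordered patch records.
#include <cassert>
#include <vector>

struct RecordSketch { int bufferOffset; int numAtoms; };

static int atomCountThrough(const std::vector<RecordSketch>& records, int numPatches) {
  const RecordSketch& last = records[numPatches - 1];
  return last.bufferOffset + last.numAtoms;
}

int main() {
  // Two home patches followed by one proxy patch.
  std::vector<RecordSketch> records = { {0, 100}, {100, 80}, {180, 50} };
  assert(atomCountThrough(records, 2) == 180);  // home atoms
  assert(atomCountThrough(records, 3) == 230);  // home + proxy atoms
  return 0;
}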
1843 
1844 void SequencerCUDA::copyDataToPeers(
1845  const bool copyIn
1846 ) {
1847  if (!copyIn) return;
1848  // Positions will be copied by the kernel
1849  // Forces don't need to be copied
1850  // Atom data that needs to be copied: sortOrder, unsortOrder
1851 
1852  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1853  patchData = cpdata.ckLocalBranch();
1854  if (mGpuOn) {
1855  CUDAMigrationKernel->copyDataToProxies(
1856  deviceIndex,
1857  numPatchesHome,
1858  numPatchesHomeAndProxy,
1859  patchData->devData[deviceIndex].d_localPatches,
1860  d_peer_id,
1861  d_peer_vdwType,
1862  d_peer_sortOrder,
1863  d_peer_unsortOrder,
1864  d_peer_charge,
1865  d_peer_partition,
1866  d_peer_patchCenter,
1867  simParams->alchOn,
1868  stream
1869  );
1870  }
1871  cudaCheck(cudaStreamSynchronize(stream));
1872 }
1873 
1874 void SequencerCUDA::migrationSortAtomsNonbonded() {
1875  CUDAMigrationKernel->sortAtoms(
1876  numPatchesHome, numAtomsHome,
1877  patchData->devData[deviceIndex].d_localPatches,
1878  patchMin, patchMax,
1879  d_pos_x, d_pos_y, d_pos_z,
1880  d_sortOrder,
1881  d_unsortOrder,
1882  d_sortIndex,
1883  stream
1884  );
1885 }
1886 
1887 void SequencerCUDA::maximumMove(
1888  const double maxvel2,
1889  const int numAtoms)
1890 {
1891  CUDASequencerKernel->maximumMove(
1892  maxvel2, d_vel_x, d_vel_y, d_vel_z,
1893  killme, numAtoms, stream);
1894 }
1895 
1896 void SequencerCUDA::submitHalf(
1897  int numAtoms, int part, const bool doEnergy)
1898 {
1899  //BigReal kineticEnergy;
1900  Tensor reduction_virial;
1901  //cudaTensor h_virial;
1902  //BigReal intKineticEnergy;
1903  Tensor reduction_intVirialNormal;
1904  //cudaTensor h_intVirialNormal;
1905  int hgs;
1906 
1907  if(doEnergy){
1908 #if 0
1909  cudaCheck(cudaEventRecord(eventStart,stream));
1910 #endif
1911  CUDASequencerKernel->submitHalf(
1912  simParams->fixedAtomsOn,
1913  d_vel_x, d_vel_y, d_vel_z,
1914  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
1915  d_kineticEnergy, d_intKineticEnergy,
1916  d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
1917  virial_half, intVirialNormal_half,
1918  d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);
1919 #if 0
1920  cudaCheck(cudaEventRecord(eventStop, stream));
1921  cudaCheck(cudaEventSynchronize(eventStop));
1922  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
1923  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
1924  t_submitReductions2 = 0;
1925 #endif
1926  }
1927 }
1928 
1929 void SequencerCUDA::submitReductions(
1930  BigReal origin_x,
1931  BigReal origin_y,
1932  BigReal origin_z,
1933  int marginViolations,
1934  int doEnergy,
1935  int doMomentum,
1936  int numAtomsReduction,
1937  int maxForceNumber)
1938 {
1939  // reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsReduction; //moved to launch_part2, startRun2
1940  // where do I get the margin violations?
1941  // reduction->item(REDUCTION_MARGIN_VIOLATIONS) += marginViolations;
1942  if(doEnergy){
1943  if(doMomentum){
1944  // JM NOTE: Calculates momenta if copyOut
1945  CUDASequencerKernel->submitReduction1(
1946  d_pos_x, d_pos_y, d_pos_z,
1947  d_vel_x, d_vel_y, d_vel_z, d_mass,
1948  d_kineticEnergy,
1949  d_momentum_x, d_momentum_y, d_momentum_z,
1950  d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1951  origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1952  momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1953  numAtomsReduction, stream);
1954  }
1955  Tensor regintVirialNormal;
1956  Tensor regintVirialNbond;
1957  Tensor regintVirialSlow;
1958 
1959 #if 0
1960  cudaCheck(cudaEventRecord(eventStart,stream));
1961 #endif
1962  CUDASequencerKernel->submitReduction2(
1963  simParams->fixedAtomsOn, d_atomFixed,
1964  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
1965  d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1966  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1967  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1968  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1969  d_mass, d_hydrogenGroupSize,
1970  d_kineticEnergy, kineticEnergy,
1971  d_intKineticEnergy, intKineticEnergy,
1972  d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1973  intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1974  d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1975  // Haochuan: actually should be doVirial here
1976  if (simParams->fixedAtomsOn) {
1977  CUDASequencerKernel->calcFixVirial(
1978  maxForceNumber, numAtomsReduction, d_atomFixed,
1979  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1980  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1981  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1982  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1983  d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1984  d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1985  }
1986  }
1987 #if 0
1988  cudaCheck(cudaEventRecord(eventStop, stream));
1989  cudaCheck(cudaEventSynchronize(eventStop));
1990  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1991  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1992  t_submitReductions2 = 0;
1993 #endif
1994 }
1995 
1996 void SequencerCUDA::copySettleParameter(){
1997  // Searching for a patch that contains initialized settle parameters
1998  cudaCheck(cudaSetDevice(deviceID));
1999  if(simParams->rigidBonds != RIGID_NONE){
2000  HomePatch *patch = NULL;
2001  // PatchList contains all patches in the node, so if there's a single water in the system,
2002  // this is guaranteed to catch it
2003  for(int i = 0; i < patchList.size(); i++){
2004  if(patchList[i]->settle_initialized) {
2005  patch = patchList[i];
2006  break;
2007  }
2008  }
2009  if ( patch ) {
2010  SettleParameters h_sp;
2011  h_sp.mO = patch->settle_mO;
2012  h_sp.mH = patch->settle_mH;
2013  h_sp.mOrmT = patch->settle_mOrmT;
2014  h_sp.mHrmT = patch->settle_mHrmT;
2015  h_sp.rra = patch->settle_rra;
2016  h_sp.ra = patch->settle_ra;
2017  h_sp.rb = patch->settle_rb;
2018  h_sp.rc = patch->settle_rc;
2019  h_sp.r_om = patch->r_om;
2020  h_sp.r_ohc = patch->r_ohc;
2021 
2022  // fprintf(stderr, "SETTLEPARAMETER Found: Values %lf %lf %lf %lf %lf %lf %lf %lf\n",
2023  // h_sp.mO, h_sp.mH, h_sp.mOrmT, h_sp.mHrmT, h_sp.rra, h_sp.ra, h_sp.rb, h_sp.rc );
2024  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
2025  }
2026  }
2027 }
2028 
2029 // Does rattle1_SOA
2030 void SequencerCUDA::startRun1(
2031  int maxForceNumber,
2032  const Lattice& lattice
2033 ) {
2034  // cudaCheck(cudaSetDevice(deviceID));
2035  myLattice = lattice;
2036 
2037  // JM: Enforcing rigid bonds on first iteration
2038  CUDASequencerKernel->rattle1(
2039  simParams->fixedAtomsOn, 1, 0,
2040  numAtomsHome, 0.f, 0.f,
2041  2.0 * simParams->rigidTol,
2042  d_vel_x, d_vel_y, d_vel_z,
2043  d_pos_x, d_pos_y, d_pos_z,
2044  d_velNew_x, d_velNew_y, d_velNew_z,
2045  d_posNew_x, d_posNew_y, d_posNew_z,
2046  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2047  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2048  &settleList, settleListSize, &d_consFailure,
2049  d_consFailureSize, &rattleList, rattleListSize,
2050  &nSettle, &nRattle,
2051  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2052  buildRigidLists, consFailure, simParams->watmodel, stream);
2053 
2054  this->copyPositionsAndVelocitiesToHost(1, 0);
2055  cudaCheck(cudaDeviceSynchronize());
2056 #if 0
2057  printSOAPositionsAndVelocities();
2058 #endif
2059 }
2060 
2061 void SequencerCUDA::startRun2(
2062  double dt_normal,
2063  double dt_nbond,
2064  double dt_slow,
2065  Vector origin,
2066  int doGlobal,
2067  int maxForceNumber
2068 ){
2069  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2070 
2071  // This is a patch-based kernel, which means each thread block handles an entire patch.
2072  // So, in the multi-GPU case, we want to keep non-offset pointers but only
2073  // deal with this device's subset of patches.
2074 
2075  // We don't need to (and should not) zero the normal forces:
2076  // they store the global forces. Additionally, the normal forces are set
2077  // every step rather than accumulated.
2078  // cudaCheck(cudaMemset(d_f_normal_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2079  // cudaCheck(cudaMemset(d_f_normal_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2080  // cudaCheck(cudaMemset(d_f_normal_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2081 
2082  cudaCheck(cudaMemset(d_f_nbond_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2083  cudaCheck(cudaMemset(d_f_nbond_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2084  cudaCheck(cudaMemset(d_f_nbond_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2085 
2086  cudaCheck(cudaMemset(d_f_slow_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2087  cudaCheck(cudaMemset(d_f_slow_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2088  cudaCheck(cudaMemset(d_f_slow_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2089 
2090  CUDASequencerKernel->accumulateForceToSOA(
2091  doGlobal,
2092  simParams->useCudaGlobal,
2093  maxForceNumber,
2094  numPatchesHomeAndProxy,
2095  nDevices,
2096  patchData->devData[deviceIndex].d_localPatches,
2097  patchData->devData[deviceIndex].f_bond,
2098  patchData->devData[deviceIndex].f_bond_nbond,
2099  patchData->devData[deviceIndex].f_bond_slow,
2100  patchData->devData[deviceIndex].forceStride,
2101  patchData->devData[deviceIndex].f_nbond,
2102  patchData->devData[deviceIndex].f_nbond_slow,
2103  patchData->devData[deviceIndex].f_slow,
2104  d_f_global_x,
2105  d_f_global_y,
2106  d_f_global_z,
2107  d_f_normal_x,
2108  d_f_normal_y,
2109  d_f_normal_z,
2110  d_f_nbond_x,
2111  d_f_nbond_y,
2112  d_f_nbond_z,
2113  d_f_slow_x,
2114  d_f_slow_y,
2115  d_f_slow_z,
2116  d_unsortOrder,
2117  myLattice,
2118  patchData->d_queues,
2119  patchData->d_queueCounters,
2120  d_tbcatomic,
2121  stream
2122  );
2123  if(mGpuOn){
2124  if(SMDKernel)
2125  {
2126  // compute distributed center of mass for groups of atoms
2127  SMDKernel->computeCOMMGpu(myLattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2128  d_transform, stream);
2129  }
2130  if(groupRestraintsKernel)
2131  {
2132  groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
2133  d_mass, d_pos_x, d_pos_y, d_pos_z,
2134  stream);
2135  }
2136  // Synchronize device before node barrier
2137  cudaCheck(cudaDeviceSynchronize());
2138  }
2139 #if 0
2140  printSOAPositionsAndVelocities();
2141 #endif
2142 }
2143 
2144 void SequencerCUDA::startRun3(
2145  double dt_normal,
2146  double dt_nbond,
2147  double dt_slow,
2148  Vector origin,
2149  const bool requestGlobalForces,
2150  int doGlobalMasterStaleForces,
2151  const bool requestForcesOutput,
2152  const bool requestGlobalForcesGPU,
2153  int maxForceNumber
2154 ){
2155  const bool doFixed = simParams->fixedAtomsOn;
2156  if(mGpuOn){
2157  // XXX TODO we need to call the force merging kernel here
2158 #if 1
2159  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2160  std::vector<int> atom_counts;
2161  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2162  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2163  }
2164  CUDASequencerKernel->mergeForcesFromPeers(
2165  deviceIndex,
2166  maxForceNumber,
2167  myLattice,
2168  numPatchesHomeAndProxy,
2169  numPatchesHome,
2170  this->d_peer_fb_x,
2171  this->d_peer_fb_y,
2172  this->d_peer_fb_z,
2173  this->d_peer_fn_x,
2174  this->d_peer_fn_y,
2175  this->d_peer_fn_z,
2176  this->d_peer_fs_x,
2177  this->d_peer_fs_y,
2178  this->d_peer_fs_z,
2179  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2180  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2181  patchData->devData[deviceIndex].d_localPatches,
2182  patchData->devData[deviceIndex].d_peerPatches,
2183  atom_counts,
2184  stream
2185  );
2186 #else
2187 
2188  // Before I call nccl, let's see the forces here
2189  // ncclAllReduce(d_f_normal_x, d_f_normal_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2190 
2191  // ncclAllReduce(d_f_normal_y, d_f_normal_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2192  // ncclAllReduce(d_f_normal_z, d_f_normal_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2193  // ncclAllReduce(d_f_nbond_x, d_f_nbond_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2194  // ncclAllReduce(d_f_nbond_y, d_f_nbond_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2195  // ncclAllReduce(d_f_nbond_z, d_f_nbond_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2196  // ncclAllReduce(d_f_slow_x, d_f_slow_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2197  // ncclAllReduce(d_f_slow_y, d_f_slow_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2198  // ncclAllReduce(d_f_slow_z, d_f_slow_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2199  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2200  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2201 #endif
2202  }
2203  if(doGlobalMasterStaleForces)
2204  {
2205  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
2206  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
2207  computeGlobalMasterVirial(
2208  numPatchesHomeAndProxy,
2209  numAtomsHome,
2210  patchData->devData[deviceIndex].d_localPatches,
2211  d_pos_x,
2212  d_pos_y,
2213  d_pos_z,
2214  d_transform,
2215  d_f_global_x,
2216  d_f_global_y,
2217  d_f_global_z,
2218  &d_extForce[EXT_GLOBALMTS],
2219  &extForce[EXT_GLOBALMTS],
2220  &d_extVirial[EXT_GLOBALMTS],
2221  &extVirial[EXT_GLOBALMTS],
2222  myLattice,
2223  d_tbcatomic,
2224  stream);
2225  }
2226 
2227  // do external forces calculation and store energy and virial
2228  calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
2229 #if 0
2230  cudaCheck(cudaDeviceSynchronize());
2231  if(true || deviceID == 0){
2232  char prefix[10];
2233  snprintf(prefix, 10, "step-%d",0);
2234  this->printSOAForces(prefix);
2235  }
2236 #endif
2237 
2238  CUDASequencerKernel->addForceToMomentum(
2239  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2240  d_recipMass,
2241  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2242  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2243  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2244  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2245  numAtomsHome, maxForceNumber, stream);
2246 
2247  CUDASequencerKernel->rattle1(
2248  simParams->fixedAtomsOn,1, 0,
2249  numAtomsHome, -dt_normal, -1.0/(dt_normal),
2250  2.0 * simParams->rigidTol,
2251  d_vel_x, d_vel_y, d_vel_z,
2252  d_pos_x, d_pos_y, d_pos_z,
2253  d_velNew_x, d_velNew_y, d_velNew_z,
2254  d_posNew_x, d_posNew_y, d_posNew_z,
2255  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2256  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2257  &settleList, settleListSize, &d_consFailure,
2258  d_consFailureSize, &rattleList, rattleListSize,
2259  &nSettle, &nRattle,
2260  d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2261  true, consFailure, simParams->watmodel, stream);
2262 
2263  CUDASequencerKernel->centerOfMass(
2264  d_vel_x, d_vel_y, d_vel_z,
2265  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2266  d_hydrogenGroupSize, numAtomsHome, stream);
2267 
2268 
2269  {
2270  // SubmitHalf and its corresponding reductions
2271  submitHalf(numAtomsHome, 1, 1);
2272  // submitHalf reductions
2273  cudaCheck(cudaStreamSynchronize(stream));
2274  Tensor reduction_virial;
2275  Tensor reduction_intVirialNormal;
2276  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2277  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2278  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2279  // Haochuan: the tensor is not symmetric when there are fixed atoms
2280  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2281  reduction_virial *= 0.5;
2282  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2283  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2284  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2285  += (intKineticEnergy_half[0] * 0.25);
2286  reduction_intVirialNormal *= 0.5;
2287  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2288  reduction_intVirialNormal);
2289  }
2290 
2291  CUDASequencerKernel->addForceToMomentum(
2292  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2293  d_recipMass,
2294  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2295  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2296  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2297  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2298  numAtomsHome, maxForceNumber, stream);
2299 
2300  CUDASequencerKernel->rattle1(
2301  simParams->fixedAtomsOn, 1, 1,
2302  numAtomsHome, dt_normal, 1.0/dt_normal,
2303  2.0 * simParams->rigidTol,
2304  d_vel_x, d_vel_y, d_vel_z,
2305  d_pos_x, d_pos_y, d_pos_z,
2306  d_velNew_x, d_velNew_y, d_velNew_z,
2307  d_posNew_x, d_posNew_y, d_posNew_z,
2308  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2309  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2310  &settleList, settleListSize, &d_consFailure,
2311  d_consFailureSize, &rattleList, rattleListSize,
2312  &nSettle, &nRattle,
2313  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2314  buildRigidLists, consFailure, simParams->watmodel, stream);
2315 
2316  CUDASequencerKernel->centerOfMass(
2317  d_vel_x, d_vel_y, d_vel_z,
2318  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2319  d_hydrogenGroupSize, numAtomsHome, stream);
2320 
2321  {
2322  // JM: SubmitHalf and its corresponding reductions
2323  submitHalf(numAtomsHome, 1, 1);
2324  // submitHalf reductions
2325  cudaCheck(cudaStreamSynchronize(stream));
2326  Tensor reduction_virial;
2327  Tensor reduction_intVirialNormal;
2328  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2329  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2330  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2331  // Haochuan: the tensor is not symmetric when there are fixed atoms
2332  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2333  reduction_virial *= 0.5;
2334  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2335  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2336  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2337  += (intKineticEnergy_half[0] * 0.25);
2338  reduction_intVirialNormal *= 0.5;
2339  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2340  reduction_intVirialNormal);
2341  }
2342 
2343  CUDASequencerKernel->addForceToMomentum(
2344  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2345  d_recipMass,
2346  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2347  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2348  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2349  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2350  numAtomsHome, maxForceNumber, stream);
2351 
2352  if(requestGlobalForces || requestForcesOutput) {
2353  // Store the forces for the next step,
2354  // when they are needed for colvars and Tcl scripting
2355  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2356  }
2357 
2358  if (requestGlobalForcesGPU) {
2359  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2360  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2361  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2362  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2363  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2364  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2365  CUDASequencerKernel->copyForcesToDevice(
2366  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2367  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2368  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2369  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2370  // cudaCheck(cudaStreamSynchronize(stream));
2371  }
2372 
2373  CUDASequencerKernel->centerOfMass(
2374  d_vel_x, d_vel_y, d_vel_z,
2375  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2376  d_hydrogenGroupSize, numAtomsHome, stream);
2377 
2378  submitReductions(origin.x, origin.y, origin.z,
2379  marginViolations, 1,
2380  1,
2381  numAtomsHome, maxForceNumber);
2382 
2383  copyPositionsAndVelocitiesToHost(1, 0);
2384 
2385  if(consFailure[0]){
2386  // Constraint failure; abort only if rigidDie is set.
2387  int dieOnError = simParams->rigidDie;
2388  if(dieOnError){
2389  // Bails out
2390  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
2391  NAMD_die("constraint failure during CUDA rattle!\n");
2392  }else{
2393  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2394  }
2395  }else if(1){
2396  cudaCheck(cudaStreamSynchronize(stream));
2397  if (doGlobalMasterStaleForces) {
2398  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2399  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2400  // PRINT_CUDATENSOR(extVirial[EXT_GLOBALMTS], iout);
2401  }
2402  if(simParams->rigidBonds != RIGID_NONE){
2403  Tensor reduction_rigidVirial;
2404  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
2405  // Haochuan: the tensor is not symmetric when there are fixed atoms
2406  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2407  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
2408  }
2409 
2410  //submitReductions1
2411  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
2412  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2413  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
2414  Vector angularMomentum(*angularMomentum_x,
2415  *angularMomentum_y,
2416  *angularMomentum_z);
2417  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
2418  //submitReductions2
2419  Tensor regintVirialNormal;
2420  Tensor regintVirialNbond;
2421  Tensor regintVirialSlow;
2422  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
2423  if (maxForceNumber >= 1) {
2424  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
2425  }
2426  if (maxForceNumber >= 2) {
2427  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
2428  }
2429 
2430  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
2431  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
2432  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
2433  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
2434 
2435  if (simParams->fixedAtomsOn) {
2436  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2437  double3 fixForceNormal, fixForceNbond, fixForceSlow;
2438  switch (maxForceNumber) {
2439  case 2: {
2440  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2441  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2442  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
2443  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
2444  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
2445  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2446  } // intentionally fallthrough
2447  case 1: {
2448  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2449  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2450  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
2451  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
2452  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
2453  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2454  } // intentionally fallthrough
2455  default: {
2456  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2457  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2458  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
2459  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
2460  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
2461  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2462  }
2463  }
2464 #if 0
2465  auto printTensor = [](const cudaTensor& t, const std::string& name){
2466  CkPrintf("%s", name.c_str());
2467  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2468  "%12.5lf %12.5lf %12.5lf\n"
2469  "%12.5lf %12.5lf %12.5lf\n",
2470  t.xx, t.xy, t.xz,
2471  t.yx, t.yy, t.yz,
2472  t.zx, t.zy, t.zz);
2473  };
2474  printTensor(fixVirialNormal, "fixVirialNormal = ");
2475  printTensor(fixVirialNbond, "fixVirialNbond = ");
2476  printTensor(fixVirialSlow, "fixVirialSlow = ");
2477 #endif
2478 
2479  }
2480  }
2481 
2482 #if 0
2483  if(deviceID == 0){
2484  this->printSOAForces(NULL);
2485  }
2486 #endif
2487 
2488 #if 0
2489  printSOAPositionsAndVelocities();
2490 #endif
2491 }
2492 
2493 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2494 {
2495  // Restore the myLattice
2496  myLattice = lattice;
2497  double *temp;
2498 
2499  // Restore positions and forces
2500  temp = d_f_normal_x; d_f_normal_x = d_f_normalMC_x; d_f_normalMC_x = temp;
2501  temp = d_f_normal_y; d_f_normal_y = d_f_normalMC_y; d_f_normalMC_y = temp;
2502  temp = d_f_normal_z; d_f_normal_z = d_f_normalMC_z; d_f_normalMC_z = temp;
2503  temp = d_f_nbond_x; d_f_nbond_x = d_f_nbondMC_x; d_f_nbondMC_x = temp;
2504  temp = d_f_nbond_y; d_f_nbond_y = d_f_nbondMC_y; d_f_nbondMC_y = temp;
2505  temp = d_f_nbond_z; d_f_nbond_z = d_f_nbondMC_z; d_f_nbondMC_z = temp;
2506  temp = d_f_slow_x; d_f_slow_x = d_f_slowMC_x; d_f_slowMC_x = temp;
2507  temp = d_f_slow_y; d_f_slow_y = d_f_slowMC_y; d_f_slowMC_y = temp;
2508  temp = d_f_slow_z; d_f_slow_z = d_f_slowMC_z; d_f_slowMC_z = temp;
2509  #ifdef NAMD_NCCL_ALLREDUCE
2510  if(mGpuOn) {
2511  temp = d_posNew_x; d_posNew_x = d_posMC_x; d_posMC_x = temp;
2512  temp = d_posNew_y; d_posNew_y = d_posMC_y; d_posMC_y = temp;
2513  temp = d_posNew_z; d_posNew_z = d_posMC_z; d_posMC_z = temp;
2514  } else {
2515  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2516  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2517  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2518  }
2519  #else
2520  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2521  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2522  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2523  #endif
2524 }
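Rejecting a Monte Carlo volume move restores the saved positions and forces by swapping buffer pointers, an O(1) operation compared with copying the arrays back. A minimal sketch of the same exchange written with std::swap; the pointer names are illustrative, not the class members:

// Sketch only: restoring saved state by exchanging buffer pointers.
#include <utility>

int main() {
  double trialBuf[3]  = {1.0, 2.0, 3.0};   // trial (scaled) positions
  double backupBuf[3] = {4.0, 5.0, 6.0};   // positions saved before the move
  double* d_pos   = trialBuf;
  double* d_posMC = backupBuf;
  // Equivalent to: temp = d_pos; d_pos = d_posMC; d_posMC = temp;
  std::swap(d_pos, d_posMC);
  (void)d_pos; (void)d_posMC;
  return 0;
}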
2525 
2526 void SequencerCUDA::monteCarloPressure_accept(
2527  const int doMigration)
2528 {
2529  // Do we need to update the centers of mass?
2530  CUDASequencerKernel->centerOfMass(
2531  d_pos_x, d_pos_y, d_pos_z,
2532  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2533  d_hydrogenGroupSize, numAtomsHome, stream);
2534 
2535  // Add half step kinetic contribution to energy, intEnergy, virial, intVirial,
2536  // calculated by submitHalf in launch_part11
2537  Tensor reduction_virial;
2538  Tensor reduction_intVirialNormal;
2539  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2540  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2541  // Haochuan: the tensor is not symmetric when there are fixed atoms
2542  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2543  reduction_virial *= 0.5;
2544  reduction_intVirialNormal *= 0.5;
2545  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2546  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2547  reduction_intVirialNormal);
2548  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2549  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += (intKineticEnergy_half[0] * 0.25);
2550 
2551  // If this was a migration step and the move was accepted, we need to update
2552  // myLatticeOld in order to use it in SequencerCUDAKernel::pairlistCheck();
2553  if(doMigration) {
2554  myLatticeOld = myLattice;
2555  }
2556 }
2557 
2558 void SequencerCUDA::monteCarloPressure_part1(
2559  Tensor &factor,
2560  Vector &origin,
2561  Lattice &oldLattice)
2562 {
2563  // Backup positions and forces
2564  //copy_DtoD<double>(d_f_raw, d_f_rawMC, numAtoms*9, stream);
2565  copy_DtoD<double>(d_f_normal_x, d_f_normalMC_x, numAtomsHome, stream);
2566  copy_DtoD<double>(d_f_normal_y, d_f_normalMC_y, numAtomsHome, stream);
2567  copy_DtoD<double>(d_f_normal_z, d_f_normalMC_z, numAtomsHome, stream);
2568  copy_DtoD<double>(d_f_nbond_x, d_f_nbondMC_x, numAtomsHome, stream);
2569  copy_DtoD<double>(d_f_nbond_y, d_f_nbondMC_y, numAtomsHome, stream);
2570  copy_DtoD<double>(d_f_nbond_z, d_f_nbondMC_z, numAtomsHome, stream);
2571  copy_DtoD<double>(d_f_slow_x, d_f_slowMC_x, numAtomsHome, stream);
2572  copy_DtoD<double>(d_f_slow_y, d_f_slowMC_y, numAtomsHome, stream);
2573  copy_DtoD<double>(d_f_slow_z, d_f_slowMC_z, numAtomsHome, stream);
2574 #ifdef NAMD_NCCL_ALLREDUCE
2575  if(mGpuOn) {
2576  //copy_DtoD<double>(d_posNew_raw, d_pos_rawMC, numAtomsHome*3, stream);
2577  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2578  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2579  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2580  } else {
2581  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2582  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2583  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2584  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2585  }
2586 #else
2587  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2588  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2589  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2590  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2591 #endif
2592 
2593  // Scale the old lattice by factor. We need both the old and the new lattice
2594  // to properly unwrap and wrap the atom coordinates
2595  Lattice newLattice = oldLattice;
2596  newLattice.rescale(factor);
2597  cudaTensor cuFactor;
2598  cudaVector cuOrigin;
2599  COPY_CUDATENSOR(factor, cuFactor);
2600  COPY_CUDAVECTOR(origin, cuOrigin);
2601 
2602  // Scale the coordinates using each molecule's geometric center
2603  Molecule *molecule = Node::Object()->molecule;
2604  CUDASequencerKernel->scaleCoordinateUsingGC(
2605  d_pos_x, d_pos_y, d_pos_z, d_idOrder, d_moleculeStartIndex,
2606  d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2607  d_transform, molecule->numMolecules, molecule->numLargeMolecules,
2608  stream);
2609 
2610  // Update the cuda lattice with newLattice for force calculation
2611  myLattice = newLattice;
2612 
2613  // Set up compute positions before calling the bonded and nonbonded kernels
2614  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2616  bool doNbond = patchData->flags.doNonbonded;
2617  bool doSlow = patchData->flags.doFullElectrostatics;
2618  bool doFEP = false;
2619  bool doTI = false;
2620  bool doAlchDecouple = false;
2621  bool doAlchSoftCore = false;
2622  if (simParams->alchOn) {
2623  if (simParams->alchFepOn) doFEP = true;
2624  if (simParams->alchThermIntOn) doTI = true;
2625  if (simParams->alchDecouple) doAlchDecouple = true;
2626  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2627  }
2628 
2629  if (Node::Object()->molecule->is_lonepairs_psf) {
2630  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
2631  }
2632 
2633  bool usePatchPme = false;
2634  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2635  // This checks whether the current lattice is compatible with the patch-level PME kernels.
2636  // It needs to be redone every time the lattice changes; the results are stored in
2637  // the cudaPme object, and the overall compatibility can be queried with the compatible()
2638  // function.
2639  //
2640  // The behavior of set_compute_positions differs depending on the kernels being used,
2641  // so that value needs to be passed to the kernel object.
2643  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
2645  myLattice,
2646  getNumPatchesHome(),
2647  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
2648  getHostPatchMin(),
2649  getHostPatchMax(),
2650  getHostAwayDists()
2651  );
2652  usePatchPme = cudaPme->patchLevelPmeData.compatible();
2653  }
2654 
2655  std::vector<int> atom_counts;
2656  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2657  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2658  }
2659  CUDASequencerKernel->set_compute_positions(
2660  deviceIndex,
2662  nDevices,
2663  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2664  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2665 #ifdef NAMD_NCCL_ALLREDUCE
2666  (mGpuOn) ? d_posNew_x: d_pos_x,
2667  (mGpuOn) ? d_posNew_y: d_pos_y,
2668  (mGpuOn) ? d_posNew_z: d_pos_z,
2669 #else
2670  d_pos_x,
2671  d_pos_y,
2672  d_pos_z,
2673  d_peer_pos_x, // passes double-pointer if mgpuOn
2674  d_peer_pos_y,
2675  d_peer_pos_z,
2676  d_peer_charge,
2677  d_peer_partition,
2678 #endif
2679  d_charge, d_partition, charge_scaling,
2680  d_patchCenter,
2681  patchData->devData[deviceIndex].slow_patchPositions,
2682  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2683  d_sortOrder, newLattice,
2684  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2685  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2687  patchData->devData[deviceIndex].d_localPatches,
2688  patchData->devData[deviceIndex].d_peerPatches,
2689  atom_counts,
2690  stream);
2691 
2692  cudaCheck(cudaStreamSynchronize(stream));
2693 }
2694 
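For the Monte Carlo barostat, scaleCoordinateUsingGC rescales the system by moving each molecule rigidly: its geometric center is scaled about the origin by the trial factor while intramolecular distances are preserved (the kernel also unwraps and rewraps coordinates with d_transform). A minimal host-side sketch of the per-molecule update; the Vec3 type and the plain 3x3 factor array are illustrative stand-ins, not the kernel's actual types.

#include <vector>

struct Vec3 { double x, y, z; };

// Scale a molecule rigidly: move its geometric center by the scaling
// transform about the origin, keeping intramolecular distances unchanged.
static void scaleMoleculeAboutGC(std::vector<Vec3>& pos,
                                 const double factor[3][3],
                                 const Vec3& origin) {
  if (pos.empty()) return;
  // geometric center of the molecule
  Vec3 gc{0, 0, 0};
  for (const Vec3& p : pos) { gc.x += p.x; gc.y += p.y; gc.z += p.z; }
  gc.x /= pos.size(); gc.y /= pos.size(); gc.z /= pos.size();
  // scaled center, measured from the scaling origin
  const Vec3 rel{gc.x - origin.x, gc.y - origin.y, gc.z - origin.z};
  const Vec3 newGC{
    origin.x + factor[0][0]*rel.x + factor[0][1]*rel.y + factor[0][2]*rel.z,
    origin.y + factor[1][0]*rel.x + factor[1][1]*rel.y + factor[1][2]*rel.z,
    origin.z + factor[2][0]*rel.x + factor[2][1]*rel.y + factor[2][2]*rel.z};
  // translate every atom by the displacement of the center
  const Vec3 d{newGC.x - gc.x, newGC.y - gc.y, newGC.z - gc.z};
  for (Vec3& p : pos) { p.x += d.x; p.y += d.y; p.z += d.z; }
}
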
2695 void SequencerCUDA::monteCarloPressure_part2(
2696  int step,
2697  int maxForceNumber,
2698  const bool doEnergy,
2699  const bool doGlobal,
2700  const bool doVirial)
2701 {
 2702  // we zero all reduction values in part1, so we need to add this here
2703  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2704 
2705  if(mGpuOn){
2706 #ifdef NAMD_NCCL_ALLREDUCE
2707  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2708 #endif
2709  }
2710  //Update SOA buffer
2711  CUDASequencerKernel->accumulateForceToSOA(
2712  doGlobal,
2713  simParams->useCudaGlobal,
2714  maxForceNumber,
2715  numPatchesHomeAndProxy,
2716  nDevices,
2717  patchData->devData[deviceIndex].d_localPatches,
2718  patchData->devData[deviceIndex].f_bond,
2719  patchData->devData[deviceIndex].f_bond_nbond,
2720  patchData->devData[deviceIndex].f_bond_slow,
2721  patchData->devData[deviceIndex].forceStride,
2722  patchData->devData[deviceIndex].f_nbond,
2723  patchData->devData[deviceIndex].f_nbond_slow,
2724  patchData->devData[deviceIndex].f_slow,
2725  d_f_global_x,
2726  d_f_global_y,
2727  d_f_global_z,
2728  d_f_normal_x,
2729  d_f_normal_y,
2730  d_f_normal_z,
2731  d_f_nbond_x,
2732  d_f_nbond_y,
2733  d_f_nbond_z,
2734  d_f_slow_x,
2735  d_f_slow_y,
2736  d_f_slow_z,
2737  d_unsortOrder,
2738  myLattice,
2739  patchData->d_queues,
2740  patchData->d_queueCounters,
2741  d_tbcatomic,
2742  stream
2743  );
2744  if(mGpuOn){
2745 #ifndef NAMD_NCCL_ALLREDUCE
2746  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2747  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
2748  // 0, deviceCUDA->getNcclComm(), stream);
2749  std::vector<int> atom_counts;
2750  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2751  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2752  }
2753  CUDASequencerKernel->mergeForcesFromPeers(
2754  deviceIndex,
2755  maxForceNumber,
2756  myLattice,
2757  numPatchesHomeAndProxy,
2758  numPatchesHome,
2759  this->d_peer_fb_x,
2760  this->d_peer_fb_y,
2761  this->d_peer_fb_z,
2762  this->d_peer_fn_x,
2763  this->d_peer_fn_y,
2764  this->d_peer_fn_z,
2765  this->d_peer_fs_x,
2766  this->d_peer_fs_y,
2767  this->d_peer_fs_z,
2768  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2769  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2770  patchData->devData[deviceIndex].d_localPatches,
2771  patchData->devData[deviceIndex].d_peerPatches,
2772  atom_counts,
2773  stream
2774  );
2775 #else
2776  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2777  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2778 #endif
2779  }
2780  // do external forces calculation
2781  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
2782 #if 0
2783  cudaCheck(cudaDeviceSynchronize());
2784  if(true || deviceID == 0){
2785  char prefix[10];
2786  snprintf(prefix, 10, "step-%d",step);
2787  this->printSOAForces(prefix);
2788  }
2789 #endif
2790 
2791 }
2792 
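In the multi-GPU path, each device accumulates its home atoms' forces and mergeForcesFromPeers then folds in the contributions that peer devices computed for the same atoms, reading their buffers directly over peer-to-peer access. A minimal sketch that assumes a common atom indexing across the peer buffers; the real kernel maps indices through the local and peer patch records, so this is only the shape of the reduction.

// Sum the force contributions written by every device into one atom's slot.
static double mergePeerForces(const double* const* peerForce,
                              int nDevices, int atomIndex) {
  double f = 0.0;
  for (int dev = 0; dev < nDevices; dev++) {
    f += peerForce[dev][atomIndex];
  }
  return f;
}
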
2793 void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
2794  rescalePairlistTolerance = val;
2795 }
2796 
2797 void SequencerCUDA::launch_part1(
2798  int step,
2799  double dt_normal,
2800  double dt_nbond,
2801  double dt_slow,
2802  double velrescaling,
2803  const double maxvel2,
2804  Tensor &factor,
2805  Vector &origin,
2806  Lattice &lattice,
2807  int reassignVelocitiesStep,
2808  int langevinPistonStep,
2809  int berendsenPressureStep,
2810  int maxForceNumber,
2811  const int copyIn,
2812  const int savePairlists,
2813  const int usePairlists,
2814  const bool doEnergy)
2815 {
2816  PatchMap* patchMap = PatchMap::Object();
2817  // Aggregate data from all patches
2818  cudaCheck(cudaSetDevice(deviceID));
2819  this->maxvel2 = maxvel2;
2820  const bool doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
2821  const bool doFixed = simParams->fixedAtomsOn;
2822  // JM: for launch_part1:
2823  // copyIn: first call
2824  myLattice = lattice;
2825  if(reassignVelocitiesStep)
2826  {
2827  const int reassignFreq = simParams->reassignFreq;
2828  BigReal newTemp = simParams->reassignTemp;
2829  newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2830  if ( simParams->reassignIncr > 0.0 ) {
2831  if ( newTemp > simParams->reassignHold && simParams->reassignHold > 0.0 )
2832  newTemp = simParams->reassignHold;
2833  } else {
2834  if ( newTemp < simParams->reassignHold )
2835  newTemp = simParams->reassignHold;
2836  }
2837  const BigReal kbT = BOLTZMANN * newTemp;
2838 
2839  CUDASequencerKernel->reassignVelocities(
2840  dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2841  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2842  d_vel_x, d_vel_y, d_vel_z,
2843  d_recipMass, kbT,
2844  numAtomsHome, numAtomsHome, 0,
2845  curandGen, stream);
2846  }
2847 
 2848  // scale the positions for the Berendsen pressure controller
2849  if(berendsenPressureStep) {
2850  cudaTensor cuFactor;
2851  cudaVector cuOrigin;
2852  COPY_CUDATENSOR(factor, cuFactor);
2853  COPY_CUDAVECTOR(origin, cuOrigin);
2854  CUDASequencerKernel->scaleCoordinateWithFactor(
2855  d_pos_x, d_pos_y, d_pos_z, d_mass, d_hydrogenGroupSize,
2856  cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2857  }
2858 
2859  if(!langevinPistonStep){
2860  // kernel fusion here
2861  // JM TODO: Fuse kernels for the langevin thermostat
2862  CUDASequencerKernel->velocityVerlet1(
2863  doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2864  dt_slow, velrescaling, d_recipMass,
2865  d_vel_x, d_vel_y, d_vel_z, maxvel2, killme, d_pos_x, d_pos_y, d_pos_z,
2866  pos_x, pos_y, pos_z, d_f_normal_x, d_f_normal_y, d_f_normal_z,
2867  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z,
2868  d_atomFixed, numAtomsHome, maxForceNumber, stream);
2869  }else{
2870  // Zero-out force buffers here
2871  CUDASequencerKernel->addForceToMomentum(
2872  doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2873  d_recipMass,
2874  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2875  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2876  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2877  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2878  numAtomsHome, maxForceNumber, stream);
2879 
2880  maximumMove(maxvel2, numAtomsHome);
2881  cudaTensor cuFactor;
2882  cudaVector cuOrigin;
2883  COPY_CUDATENSOR(factor, cuFactor);
2884  COPY_CUDAVECTOR(origin, cuOrigin);
2885  double velFactor_x = namd_reciprocal(factor.xx);
2886  double velFactor_y = namd_reciprocal(factor.yy);
2887  double velFactor_z = namd_reciprocal(factor.zz);
2888 
2889  CUDASequencerKernel->addVelocityToPosition(
2890  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2891  d_vel_x, d_vel_y, d_vel_z,
2892  d_pos_x, d_pos_y, d_pos_z,
2893  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2894  CUDASequencerKernel->langevinPiston(
2895  simParams->fixedAtomsOn, d_atomFixed,
2896  d_groupFixed, d_transform, lattice,
2897  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2898  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
2899  d_mass, d_hydrogenGroupSize,
2900  cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2901  simParams->useGroupPressure, numAtomsHome, stream);
2902  CUDASequencerKernel->addVelocityToPosition(
2903  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2904  d_vel_x, d_vel_y, d_vel_z,
2905  d_pos_x, d_pos_y, d_pos_z,
2906  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2907  }
2908  if(mGpuOn && SMDKernel)
2909  {
2910  // compute distributed center of mass for groups of atoms
2911  SMDKernel->computeCOMMGpu(lattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2912  d_transform, stream);
2913  }
2914  if(mGpuOn && groupRestraintsKernel)
2915  {
2916  groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
2917  d_mass, d_pos_x, d_pos_y, d_pos_z,
2918  stream);
2919  }
2920 
 2921  // JM: Recalculate centers of mass when doing an energy calculation or when langevinPistonOn is set
2922  if( (doEnergy || doVirial) ) {
2923  CUDASequencerKernel->centerOfMass(
2924  d_pos_x, d_pos_y, d_pos_z,
2925  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2926  d_hydrogenGroupSize, numAtomsHome, stream);
2927  CUDASequencerKernel->centerOfMass(
2928  d_vel_x, d_vel_y, d_vel_z,
2929  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2930  d_hydrogenGroupSize, numAtomsHome, stream);
2931  }
2932 
 2933  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
 2934      ComputeNonbondedUtil::dielectric_1);
 2935  // We need to find doNbond and doSlow for the upcoming step
2936  bool doNbond = patchData->flags.doNonbonded;
2937  bool doSlow = patchData->flags.doFullElectrostatics;
2938 
2939  bool doFEP = false;
2940  bool doTI = false;
2941  bool doAlchDecouple = false;
2942  bool doAlchSoftCore = false;
2943  if (simParams->alchOn) {
2944  if (simParams->alchFepOn) doFEP = true;
2945  if (simParams->alchThermIntOn) doTI = true;
2946  if (simParams->alchDecouple) doAlchDecouple = true;
2947  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2948  }
2949  if ( ! savePairlists ) {
2950 
2951  double minSize = simParams->patchDimension - simParams->margin;
2952  double sysdima = lattice.a_r().unit() * lattice.a();
2953  double sysdimb = lattice.b_r().unit() * lattice.b();
2954  double sysdimc = lattice.c_r().unit() * lattice.c();
2955  // Let's pass migrationInfo here
2956 
2957  CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2958  patchData->devData[deviceIndex].d_localPatches,
2959  d_pos_x, d_pos_y, d_pos_z, d_posSave_x, d_posSave_y, d_posSave_z,
2960  d_awayDists,
2961  myLattice, myLatticeOld,
2962  d_patchMin, d_patchMax, d_patchCenter,
2963  d_mInfo,
2964  d_tbcatomic, simParams->pairlistTrigger,
2965  simParams->pairlistGrow, simParams->pairlistShrink,
2966  d_patchMaxAtomMovement, patchMaxAtomMovement,
2967  d_patchNewTolerance, patchNewTolerance,
2968  minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2969  h_marginViolations,
2970  h_periodicCellSmall,
2971  rescalePairlistTolerance,
2972  isPeriodic, stream);
2973  rescalePairlistTolerance = false;
2974  }
2975  else {
2976  rescalePairlistTolerance = true;
2977  }
2978 
2979  if(mGpuOn){
 2980  // Synchronize device before node barrier
2981  cudaCheck(cudaStreamSynchronize(stream));
2982  }
2983 }
2984 
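When no Langevin-piston step is due, launch_part1 uses the fused velocityVerlet1 kernel instead of the separate kick, drift, and piston kernels of the else branch. A per-atom sketch of what the fused half-kick plus drift computes, assuming SoA double arrays; the real kernel also applies velrescaling, fixed-atom masks, and the maxvel2 safety check.

static void halfKickAndDrift(
    int numAtoms, double dt, double dt_nbond, double dt_slow,
    int maxForceNumber, const double* recipMass,
    const double* fnx, const double* fny, const double* fnz,   // normal forces
    const double* fbx, const double* fby, const double* fbz,   // nonbonded forces
    const double* fsx, const double* fsy, const double* fsz,   // slow forces
    double* vx, double* vy, double* vz,
    double* px, double* py, double* pz) {
  for (int i = 0; i < numAtoms; i++) {
    // accumulate the kick from each force level with its own timestep
    double kx = dt * fnx[i], ky = dt * fny[i], kz = dt * fnz[i];
    if (maxForceNumber >= 1) { kx += dt_nbond * fbx[i]; ky += dt_nbond * fby[i]; kz += dt_nbond * fbz[i]; }
    if (maxForceNumber >= 2) { kx += dt_slow  * fsx[i]; ky += dt_slow  * fsy[i]; kz += dt_slow  * fsz[i]; }
    vx[i] += 0.5 * kx * recipMass[i];   // half kick (scale = 0.5)
    vy[i] += 0.5 * ky * recipMass[i];
    vz[i] += 0.5 * kz * recipMass[i];
    px[i] += dt * vx[i];                // drift by a full step
    py[i] += dt * vy[i];
    pz[i] += dt * vz[i];
  }
}
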
2985 void SequencerCUDA::launch_part11(
2986  double dt_normal,
2987  double dt_nbond,
2988  double dt_slow,
2989  double velrescaling,
2990  const double maxvel2,
2991  Tensor &factor,
2992  Vector &origin,
2993  Lattice &lattice,
2994  int langevinPistonStep,
2995  int maxForceNumber,
2996  const int copyIn,
2997  const int savePairlists,
2998  const int usePairlists,
2999  const bool doEnergy)
3000 {
3001  const bool doVirial = simParams->langevinPistonOn;
 3002  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
 3003      ComputeNonbondedUtil::dielectric_1);
 3004  // We need to find doNbond and doSlow for the upcoming step
3005  bool doNbond = patchData->flags.doNonbonded;
3006  bool doSlow = patchData->flags.doFullElectrostatics;
3007 
3008  bool doFEP = false;
3009  bool doTI = false;
3010  bool doAlchDecouple = false;
3011  bool doAlchSoftCore = false;
3012  if (simParams->alchOn) {
3013  if (simParams->alchFepOn) doFEP = true;
3014  if (simParams->alchThermIntOn) doTI = true;
3015  if (simParams->alchDecouple) doAlchDecouple = true;
3016  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3017  }
3018 
3019  submitHalf(numAtomsHome, 1, doEnergy || doVirial);
3020 
3021  // Updating numerical flags
3022  NAMD_EVENT_START(1, NamdProfileEvent::CPY_PATCHFLAGS);
3023  this->update_patch_flags();
3024  NAMD_EVENT_STOP(1, NamdProfileEvent::CPY_PATCHFLAGS);
3025 
3026  finish_part1(copyIn, patchList[0]->flags.savePairlists,
3027  patchList[0]->flags.usePairlists);
3028 }
3029 
3030 
3031 void SequencerCUDA::launch_set_compute_positions() {
3032 
 3033  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
 3034      ComputeNonbondedUtil::dielectric_1);
 3035  // We need to find doNbond and doSlow for the upcoming step
3036  bool doNbond = patchData->flags.doNonbonded;
3037  bool doSlow = patchData->flags.doFullElectrostatics;
3038 
3039  bool doFEP = false;
3040  bool doTI = false;
3041  bool doAlchDecouple = false;
3042  bool doAlchSoftCore = false;
3043  if (simParams->alchOn) {
3044  if (simParams->alchFepOn) doFEP = true;
3045  if (simParams->alchThermIntOn) doTI = true;
3046  if (simParams->alchDecouple) doAlchDecouple = true;
3047  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3048  }
3049  bool doGlobal = simParams->tclForcesOn || simParams->colvarsOn || simParams->IMDon;
3050  // Set the positions of lone pairs before copying to NB buffers
3051  if (Node::Object()->molecule->is_lonepairs_psf) {
3052  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3053  }
3054 
3055  bool usePatchPme = false;
3056  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3057  // This will check if the current lattice is compatible with the patch-level PME kernels
3058  // This needs to be redone every time the lattice changes, the results are stored in
3059  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3060  // function.
3061  //
3062  // The behavior of set compute positions PME is different depending on the kernels being used,
3063  // so that value needs to be passed to the kernel object
3065  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3067  myLattice,
3068  getNumPatchesHome(),
3069  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3070  getHostPatchMin(),
3071  getHostPatchMax(),
3072  getHostAwayDists()
3073  );
3074  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3075  }
3076 
3077  if (1) {
3078  //fprintf(stderr, "calling set_compute_positions() ****************************************\n");
3079  //fprintf(stderr, "calling set_compute_positions\n");
3080  //fprintf(stderr, "doNbond=%d doSlow=%d\n", doNbond, doSlow);
3081  std::vector<int> atom_counts;
3082  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3083  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3084  }
3085  CUDASequencerKernel->set_compute_positions(
3086  deviceIndex,
3088  nDevices,
3089  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3090  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3091 #ifdef NAMD_NCCL_ALLREDUCE
3092  (mGpuOn) ? d_posNew_x: d_pos_x,
3093  (mGpuOn) ? d_posNew_y: d_pos_y,
3094  (mGpuOn) ? d_posNew_z: d_pos_z,
3095 #else
3096  d_pos_x,
3097  d_pos_y,
3098  d_pos_z,
3099  d_peer_pos_x, // passes double-pointer if mgpuOn
3100  d_peer_pos_y,
3101  d_peer_pos_z,
3102  d_peer_charge,
3103  d_peer_partition,
3104 #endif
3105  d_charge, d_partition, charge_scaling,
3106  d_patchCenter,
3107  patchData->devData[deviceIndex].slow_patchPositions,
3108  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3109  d_sortOrder, myLattice,
3110  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3111  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3113  patchData->devData[deviceIndex].d_localPatches,
3114  patchData->devData[deviceIndex].d_peerPatches,
3115  atom_counts,
3116  stream);
3117  // For global forces, copy the coordinate to host with kernel overlap
3118  if (doGlobal) {
3119  NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_POSITION);
3120  // CkPrintf("WARNING this probably needs to be changed for multihost\n");
3121  copyPositionsToHost_direct();
3122  //copyPositionsAndVelocitiesToHost(1,0);
3123  NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_POSITION);
3124  }
3125  }
3126 }
3127 
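The charge_scaling factor computed at the top of this function (and of the other position-staging paths) folds Coulomb's constant, the electrostatic scaling factor, and the inverse dielectric into the single-precision charges handed to the nonbonded and PME kernels, so the pair kernels can form q_i*q_j/r directly without multiplying by constants per pair. A minimal sketch, with the constants passed in explicitly rather than read from ComputeNonbondedUtil.

#include <cmath>

// Pre-scale a charge once so that pair kernels only compute q_i * q_j / r.
static inline float scaledCharge(double q, double coulomb,
                                 double scaling, double dielectric_1) {
  const double charge_scaling = std::sqrt(coulomb * scaling * dielectric_1);
  return static_cast<float>(charge_scaling * q);
}
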
3128 void SequencerCUDA:: finish_part1( const int copyIn,
3129  const int savePairlists,
3130  const int usePairlists)
3131 {
 3132  // JM: If we're not in a migration step, overlap flagging the
 3133  // positions before we synchronize the stream, to reduce the compute
 3134  // launch overhead.
 3135  // Hopefully we will see some overlap in this region.
 3136  //
 3137  // TODO: We could split this into a separate function and start calling positionsReady
 3138  // when we're not on a migration step, so that some of the work overlaps with the kernels.
 3139  // Before we synchronize, we can clear the device memory,
 3140  // which sets the tileListStat for the nbondKernel.
3141  cudaCheck(cudaStreamSynchronize(stream));
3142 
3143  // Checks if periodic cell became too small
3144  if(*h_periodicCellSmall){
3145  NAMD_die("Periodic cell has become too small for original patch grid!\n"
3146  "Possible solutions are to restart from a recent checkpoint,\n"
3147  "increase margin, or disable useFlexibleCell for liquid simulation.");
3148  }
3149 
3150  if (killme[0]) {
3151  const Molecule* mol = Node::Object()->molecule;
3152  // Found at least one atom that is moving too fast.
3153  // Terminating, so loop performance below doesn't matter.
3154  // Loop does not vectorize
3155  double *vel_x, *vel_y, *vel_z;
3156  std::vector<int> id;
3157  std::vector<int> patchIDofAtoms(numAtomsHome);
3158  allocate_host<double>(&vel_x, numAtomsHome);
3159  allocate_host<double>(&vel_y, numAtomsHome);
3160  allocate_host<double>(&vel_z, numAtomsHome);
3161  copy_DtoH_sync<double>(d_vel_x, vel_x, numAtomsHome);
3162  copy_DtoH_sync<double>(d_vel_y, vel_y, numAtomsHome);
3163  copy_DtoH_sync<double>(d_vel_z, vel_z, numAtomsHome);
3164  // Update the id array from patchDataSOA
3165  size_t offset = 0;
3166  // TODO: Does this work for GPU atom migration?
3167  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3168  // std::vector<int> patchIDtoIndex(numPatchesHome);
3169  for (int i = 0; i < numPatchesHome; i++) {
3170  HomePatch *patch = homePatches[i];
3171  PatchDataSOA& current = patchList[i]->patchDataSOA;
3172  const int numPatchAtoms = current.numAtoms;
3173  id.resize(numPatchAtoms + id.size());
3174  for(int j = 0; j < numPatchAtoms; j++){
3175  if (!simParams->useDeviceMigration) {
3176  id[offset + j] = current.id[j];
3177  }
3178  patchIDofAtoms[offset + j] = patch->getPatchID();
3179  }
3180  offset += numPatchAtoms;
3181  }
3182  if (simParams->useDeviceMigration) {
3183  id.resize(numAtomsHome);
3184  copy_DtoH_sync<int>(d_idMig, id.data(), numAtomsHome);
3185  }
3186  int cnt = 0;
3187  for (int i=0; i < numAtomsHome; i++) {
3188  BigReal vel2 =
3189  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3190  if (vel2 > maxvel2) {
3191  ++cnt;
3192  iout << iERROR << " velocity is "
3193  << PDBVELFACTOR * vel_x[i] << " "
3194  << PDBVELFACTOR * vel_y[i] << " "
3195  << PDBVELFACTOR * vel_z[i]
3196  << " (limit is "
3197  << ( PDBVELFACTOR * sqrt(maxvel2) ) << ", atom ID "
3198  << id[i]+1
3199  // XXX: TODO: mol->get_atomtype only works on a single-node
3200  << " of type " << mol->get_atomtype(id[i])
3201  << " in patch " << patchIDofAtoms[i]
3202  << " on PE " << CkMyPe()
3203  << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
3204  << " atoms)\n" << endi;
3205  }
3206  }
3207  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3208  "; simulation has become unstable ("
3209  << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
3210  if (simParams->crashOutputFlag & NAMD_CRASH_ATOM_TOO_FAST) {
3211  double *pos_x, *pos_y, *pos_z;
3212  allocate_host<double>(&pos_x, numAtomsHome);
3213  allocate_host<double>(&pos_y, numAtomsHome);
3214  allocate_host<double>(&pos_z, numAtomsHome);
3215  copy_DtoH_sync<double>(d_pos_x, pos_x, numAtomsHome);
3216  copy_DtoH_sync<double>(d_pos_y, pos_y, numAtomsHome);
3217  copy_DtoH_sync<double>(d_pos_z, pos_z, numAtomsHome);
3218  // Save the positions and velocities to a file for debugging
3219  const std::string outfilename =
3220  std::string(simParams->crashFilename) + "." +
3221  std::to_string(deviceIndex);
3222  std::ofstream ofs_crash_dump(outfilename.c_str());
3223  ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
3224  for (int i=0; i < numAtomsHome; i++) {
3225  ofs_crash_dump << id[i]+1 << ","
3226  << pos_x[i] << ","
3227  << pos_y[i] << ","
3228  << pos_z[i] << ","
3229  << PDBVELFACTOR * vel_x[i] << ","
3230  << PDBVELFACTOR * vel_y[i] << ","
3231  << PDBVELFACTOR * vel_z[i] << "\n";
3232  }
3233  ofs_crash_dump.flush();
3234  ofs_crash_dump.close();
3235  iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
3236  << ": the atom positions and velocities have been written to "
3237  << outfilename << "\n" << endi;
3238  deallocate_host<double>(&pos_x);
3239  deallocate_host<double>(&pos_y);
3240  deallocate_host<double>(&pos_z);
3241  }
3242  deallocate_host<double>(&vel_x);
3243  deallocate_host<double>(&vel_y);
3244  deallocate_host<double>(&vel_z);
3245  NAMD_die("SequencerCUDA: Atoms moving too fast");
3246  }
3247  else{
3248  // submitHalf reductions
3249  Tensor reduction_virial;
3250  Tensor reduction_intVirialNormal;
3251  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3252  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3253  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3254  // Haochuan: the tensor is not symmetric when there are fixed atoms
3255  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3256  reduction_virial *= 0.5;
3257  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3258  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
3259  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3260  += (intKineticEnergy_half[0] * 0.25);
3261  reduction_intVirialNormal *= 0.5;
3262  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3263  reduction_intVirialNormal);
 3264  int migration = (h_marginViolations[0] != 0) ? 1 : 0; // flags migration as TRUE if a margin violation occurred
3265  // if(migration != 0 ) fprintf(stderr, "DEV[%d] = MIGRATION[%d]\n", deviceID, migration);
3266  patchData->migrationFlagPerDevice[deviceIndex] = migration; // Saves the updated migration flag
3267  h_marginViolations[0] = 0;
3268  }
3269 }
3270 
3271 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3272  bool copyOut, const int doGlobal){
3273  // CkPrintf("copy positions and velocities to host copyout %d doGlobal %d\n", copyOut, doGlobal);
3274  if(copyOut){
3275  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3276  patchData = cpdata.ckLocalBranch();
3277  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3278  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3279  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3280  const int numAtomsToCopy = numAtomsHome;
3281  copy_DtoH<double>(d_vel_x, vel_x, numAtomsToCopy, stream);
3282  copy_DtoH<double>(d_vel_y, vel_y, numAtomsToCopy, stream);
3283  copy_DtoH<double>(d_vel_z, vel_z, numAtomsToCopy, stream);
3284  if (!doGlobal) {
3285  // We already copied coordinate if we have global forces
3286  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3287  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3288  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3289  }
3290  cudaCheck(cudaDeviceSynchronize());
3291 
3292  for(int i = 0; i < homePatches.size(); i++){
3293 
3294  // TODO do we need to copy proxy patches as well
3295  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3296  const int numPatchAtoms = localPatches[i].numAtoms;
3297  const int offset = localPatches[i].bufferOffset;
3298  memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3299  memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3300  memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3301  if (!doGlobal) {
3302  // We already copied coordinate if we have global forces
3303  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3304  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3305  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3306  }
3307  }
3308  }
3309 }
3310 
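The loop above scatters the aggregated per-device arrays back into each patch's PatchDataSOA using the contiguous slice [bufferOffset, bufferOffset + numAtoms) recorded per patch. A minimal host-side sketch of that layout, with a hypothetical Record struct standing in for CudaLocalRecord.

#include <cstring>
#include <vector>

// Hypothetical stand-in for CudaLocalRecord: each patch owns a contiguous
// slice of the aggregated per-device array.
struct Record { int bufferOffset; int numAtoms; };

// Scatter an aggregated host array back into per-patch arrays.
static void scatterToPatches(const double* aggregated,
                             const std::vector<Record>& records,
                             std::vector<double*>& perPatch) {
  for (size_t i = 0; i < records.size(); i++) {
    std::memcpy(perPatch[i], aggregated + records[i].bufferOffset,
                records[i].numAtoms * sizeof(double));
  }
}
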
3311 
3312 void SequencerCUDA::copyPositionsToHost(){
3313  // CkPrintf("copy positions to host \n");
3314  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3315  patchData = cpdata.ckLocalBranch();
3316  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3317  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3318  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3319 
3320  const int numAtomsToCopy = numAtomsHome;
3321  // We already copied coordinate if we have global forces
3322  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3323  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3324  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3325  cudaCheck(cudaDeviceSynchronize());
3326 
3327  for(int i = 0; i < homePatches.size(); i++){
3328 
3329  // TODO do we need to copy proxy patches as well
3330  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3331  const int numPatchAtoms = localPatches[i].numAtoms;
3332  const int offset = localPatches[i].bufferOffset;
3333  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3334  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3335  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3336  }
3337 }
3338 
3339 void SequencerCUDA::update_patch_flags()
3340 {
3341  // int pairlists = 1;
3342  int pairlists = (patchData->flags.step < simParams->N);
3343  for (int i=0; i < numPatchesHome; i++) {
3344  HomePatch *patch = patchList[i];
3345  patch->flags.copyIntFlags(patchData->flags); // copy global flags
3346  }
3347 }
3348 
3349 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3350  int pairlists = patchList[0]->flags.step < simParams->N;
3351  for(int i = 0; i < numPatchesHome; i++){
3352  //for(int i = 0; i < numPatches; i++){
3353  HomePatch *patch = patchList[i];
3354  Sequencer *seq = patch->sequencer;
3355  // the following logic is duplicated from Sequencer::runComputeObjects
 3356  // Migration always invalidates pairlists
3357  if (doMigration) {
3358  seq->pairlistsAreValid = 0;
3359  }
3360  if (seq->pairlistsAreValid &&
3361  ( patch->flags.doFullElectrostatics || ! simParams->fullElectFrequency )
3362  && (seq->pairlistsAge > seq->pairlistsAgeLimit) ) {
3363  seq->pairlistsAreValid = 0;
3364  }
3365  patch->flags.usePairlists = pairlists || seq->pairlistsAreValid;
3366  patch->flags.savePairlists = pairlists && !seq->pairlistsAreValid;
3367  if(patch->flags.savePairlists){
3368  // We need to rebuild pairlists -> reset tolerance values
3369  patch->flags.pairlistTolerance = patchList[i]->doPairlistCheck_newTolerance; // update pairListTolerance
3370  patch->flags.maxAtomMovement = 0;
3371  patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3372  }else if(patch->flags.usePairlists){
3373  // We can keep going with the existing pairlists -> update tolerances
3374  patch->flags.maxAtomMovement = patchMaxAtomMovement[i];
3375  patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3376  }else{
3377  // End of simulation
3378  patch->flags.maxAtomMovement=99999.;
3379  patch->flags.pairlistTolerance = 0.;
3380  }
3381  }
3382  if(patchList[0]->flags.savePairlists){
3383  // Backs up d_posSave_* for pairlistCheck
3384  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
3385  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
3386  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
3387  myLatticeOld = myLattice;
3388  }
3389 }
3390 
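updatePairlistFlags mirrors the pairlist bookkeeping of Sequencer::runComputeObjects on a per-patch basis. Below is a host-side sketch of the state transitions, with field names chosen to mirror the flags used above; the grow/trigger adjustments themselves happen in the PairListMarginCheck kernel, so this only covers what this function does.

struct PatchPairlistState {
  double pairlistTolerance;   // flags.pairlistTolerance
  double maxAtomMovement;     // flags.maxAtomMovement
  double newTolerance;        // doPairlistCheck_newTolerance
};

static void updatePairlistState(PatchPairlistState& s,
                                bool savePairlists, bool usePairlists,
                                double measuredMovement,
                                double measuredNewTolerance,
                                double pairlistShrink) {
  if (savePairlists) {
    // rebuilding: reset the tolerance and tighten it for the next rebuild
    s.pairlistTolerance = s.newTolerance;
    s.maxAtomMovement = 0.0;
    s.newTolerance *= (1.0 - pairlistShrink);
  } else if (usePairlists) {
    // reusing: carry forward what the margin-check kernel measured
    s.maxAtomMovement = measuredMovement;
    s.newTolerance = measuredNewTolerance;
  } else {
    // end of simulation: force invalidation
    s.maxAtomMovement = 99999.;
    s.pairlistTolerance = 0.;
  }
}
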
3391 void SequencerCUDA::finish_patch_flags(int migration)
3392 {
3393  for (int i=0; i < numPatchesHome; i++) {
3394  HomePatch *patch = patchList[i];
3395  Sequencer *seq = patch->sequencer;
3396  if (patch->flags.savePairlists && patch->flags.doNonbonded) {
3397  seq->pairlistsAreValid = 1;
3398  seq->pairlistsAge = 0;
3399  }
3400  if (seq->pairlistsAreValid /* && ! pressureStep */) {
3401  ++(seq->pairlistsAge);
3402  }
3403  }
3404 }
3405 
3406 
3407 void SequencerCUDA::launch_part2(
3408  const int doMCPressure,
3409  double dt_normal,
3410  double dt_nbond,
3411  double dt_slow,
3412  Vector &origin,
3413  int step,
3414  int maxForceNumber,
3415  const int langevinPistonStep,
3416  const int copyIn,
3417  const int copyOut,
3418  const int doGlobal,
3419  const bool doEnergy)
3420 {
3421  PatchMap* patchMap = PatchMap::Object();
3422  Tensor localVirial;
3423  //cudaTensor h_rigidVirial;
3424  bool doNbond = false;
3425  bool doSlow = false;
3426  cudaCheck(cudaSetDevice(deviceID));
3427  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3428  const int is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3429  // const int doVirial = langevinPistonStep;
3430  // JM: For launch_part2:
3431  // copyIn = migration steps
3432 
3433  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
3434 
3435  if(mGpuOn){
3436 #ifdef NAMD_NCCL_ALLREDUCE
3437  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3438 #endif
3439  }
3440 
3441  if(!simParams->langevinOn && !simParams->eFieldOn && !simParams->constraintsOn &&
3442  !simParams->SMDOn && !simParams->groupRestraintsOn && !doMCPressure &&
3443  !simParams->mgridforceOn && ! simParams->gridforceOn && !mGpuOn &&
3444  !simParams->consForceOn &&
3445  (simParams->watmodel == WaterModel::TIP3) &&
3446  (!is_lonepairs_psf)){
3447  CUDASequencerKernel->accumulate_force_kick(
3448  simParams->fixedAtomsOn,
3449  doGlobal,
3450  simParams->useCudaGlobal,
3451  maxForceNumber,
3452  numPatchesHomeAndProxy,
3453  patchData->devData[deviceIndex].d_localPatches,
3454  patchData->devData[deviceIndex].f_bond,
3455  patchData->devData[deviceIndex].f_bond_nbond,
3456  patchData->devData[deviceIndex].f_bond_slow,
3457  patchData->devData[deviceIndex].forceStride,
3458  patchData->devData[deviceIndex].f_nbond,
3459  patchData->devData[deviceIndex].f_nbond_slow,
3460  patchData->devData[deviceIndex].f_slow,
3461  d_f_global_x,
3462  d_f_global_y,
3463  d_f_global_z,
3464  d_f_normal_x,
3465  d_f_normal_y,
3466  d_f_normal_z,
3467  d_f_nbond_x,
3468  d_f_nbond_y,
3469  d_f_nbond_z,
3470  d_f_slow_x,
3471  d_f_slow_y,
3472  d_f_slow_z,
3473  d_vel_x,
3474  d_vel_y,
3475  d_vel_z,
3476  d_recipMass,
3477  d_atomFixed,
3478  dt_normal,
3479  dt_nbond,
3480  dt_slow,
3481  1.0,
3482  d_unsortOrder,
3483  myLattice,
3484  stream
3485  );
3486  }else{
3487  CUDASequencerKernel->accumulateForceToSOA(
3488  doGlobal,
3489  simParams->useCudaGlobal,
3490  maxForceNumber,
3491  numPatchesHomeAndProxy,
3492  nDevices,
3493  patchData->devData[deviceIndex].d_localPatches,
3494  patchData->devData[deviceIndex].f_bond,
3495  patchData->devData[deviceIndex].f_bond_nbond,
3496  patchData->devData[deviceIndex].f_bond_slow,
3497  patchData->devData[deviceIndex].forceStride,
3498  patchData->devData[deviceIndex].f_nbond,
3499  patchData->devData[deviceIndex].f_nbond_slow,
3500  patchData->devData[deviceIndex].f_slow,
3501  d_f_global_x,
3502  d_f_global_y,
3503  d_f_global_z,
3504  d_f_normal_x,
3505  d_f_normal_y,
3506  d_f_normal_z,
3507  d_f_nbond_x,
3508  d_f_nbond_y,
3509  d_f_nbond_z,
3510  d_f_slow_x,
3511  d_f_slow_y,
3512  d_f_slow_z,
3513  d_unsortOrder,
3514  myLattice,
3515  patchData->d_queues,
3516  patchData->d_queueCounters,
3517  d_tbcatomic,
3518  stream
3519  );
3520 
3521  }
3522 
3523  if (mGpuOn) {
 3524  // Synchronize device before node barrier
3525  cudaCheck(cudaDeviceSynchronize());
3526  }
3527 }
3528 
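launch_part2 picks between two kernels: when no enabled feature needs to see or modify the accumulated forces before the velocity update, force accumulation and the kick are fused into accumulate_force_kick; otherwise only accumulateForceToSOA runs here and the kick is deferred to launch_part3. A sketch of that decision, using a hypothetical flag struct in place of SimParameters.

struct IntegrationFeatures {
  bool langevin, eField, constraints, smd, groupRestraints, mcPressure;
  bool gridforce, mgridforce, consForce, multiGpu, lonepairs, tip3Water;
};

// The fused path is only safe when nothing else will touch the accumulated
// forces between accumulation and the kick.
static bool canFuseForceAndKick(const IntegrationFeatures& f) {
  return !f.langevin && !f.eField && !f.constraints && !f.smd &&
         !f.groupRestraints && !f.mcPressure && !f.gridforce &&
         !f.mgridforce && !f.consForce && !f.multiGpu &&
         f.tip3Water && !f.lonepairs;
}
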
 3529 // launch_part2 is broken into two parts to support the MC barostat
3530 void SequencerCUDA::launch_part3(
3531  const int doMCPressure,
3532  double dt_normal,
3533  double dt_nbond,
3534  double dt_slow,
3535  Vector &origin,
3536  int step,
3537  int maxForceNumber,
3538  const bool requestGlobalForces,
3539  const int doGlobalStaleForces,
3540  const bool forceRequestedGPU,
3541  const int copyIn,
3542  const int copyOut,
3543  const bool doEnergy,
3544  const bool requestForcesOutput)
3545 {
3546  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3547  const bool doFixed = simParams->fixedAtomsOn;
3548  const double velrescaling = 1; // no rescaling
3549 
3550  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3551  simParams->SMDOn || simParams->groupRestraintsOn || requestGlobalForces || simParams->gridforceOn|| simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3552  (simParams->watmodel != WaterModel::TIP3) ||
3554  if(mGpuOn){
3555 #ifndef NAMD_NCCL_ALLREDUCE
3556  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
3557  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
3558  // 0, deviceCUDA->getNcclComm(), stream);
3559 
3560  std::vector<int> atom_counts;
3561  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3562  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3563  }
3564  CUDASequencerKernel->mergeForcesFromPeers(
3565  deviceIndex,
3566  maxForceNumber,
3567  myLattice,
3568  numPatchesHomeAndProxy,
3569  numPatchesHome,
3570  this->d_peer_fb_x,
3571  this->d_peer_fb_y,
3572  this->d_peer_fb_z,
3573  this->d_peer_fn_x,
3574  this->d_peer_fn_y,
3575  this->d_peer_fn_z,
3576  this->d_peer_fs_x,
3577  this->d_peer_fs_y,
3578  this->d_peer_fs_z,
3579  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
3580  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
3581  patchData->devData[deviceIndex].d_localPatches,
3582  patchData->devData[deviceIndex].d_peerPatches,
3583  atom_counts,
3584  stream
3585  );
3586 #else
3587  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3588  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
3589 #endif
3590  }
3591  if(doVirial && doGlobalStaleForces)
3592  {
3593  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3594  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3595  computeGlobalMasterVirial(
3596  numPatchesHomeAndProxy,
3597  numAtomsHome,
3598  patchData->devData[deviceIndex].d_localPatches,
3599  d_pos_x,
3600  d_pos_y,
3601  d_pos_z,
3602  d_transform,
3603  d_f_global_x,
3604  d_f_global_y,
3605  d_f_global_z,
3606  &d_extForce[EXT_GLOBALMTS],
3607  &extForce[EXT_GLOBALMTS],
3608  &d_extVirial[EXT_GLOBALMTS],
3609  &extVirial[EXT_GLOBALMTS],
3610  myLattice,
3611  d_tbcatomic,
3612  stream);
3613  }
3614  calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
3615 #if 0
3616  cudaCheck(cudaDeviceSynchronize());
3617  if(true || deviceID == 0){
3618  char prefix[10];
3619  snprintf(prefix, 10, "step-%d",step);
3620  this->printSOAForces(prefix);
3621  }
3622 #endif
3623 
3624  }
3625 
3626  if (simParams->langevinOn) {
3627  CUDASequencerKernel->langevinVelocitiesBBK1(
3628  dt_normal, d_langevinParam, d_vel_x, d_vel_y, d_vel_z, numAtomsHome, stream);
3629  }
3630 
3631  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3632  simParams->SMDOn || simParams->groupRestraintsOn || doMCPressure || simParams->gridforceOn || simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3633  (simParams->watmodel != WaterModel::TIP3) ||
3635  CUDASequencerKernel->addForceToMomentum(
3636  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3637  d_recipMass,
3638  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3639  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3640  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3641  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3642  numAtomsHome, maxForceNumber, stream);
3643  }
3644 
3645  if (simParams->langevinOn) {
3646 
3647  // must enforce rigid bond constraints if langevin gammas differ
3648  if (simParams->rigidBonds != RIGID_NONE &&
3649  simParams->langevinGammasDiffer) {
3650  CUDASequencerKernel->rattle1(
3651  simParams->fixedAtomsOn, doEnergy || doVirial,
3652  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3653  2.0 * simParams->rigidTol,
3654  d_vel_x, d_vel_y, d_vel_z,
3655  d_pos_x, d_pos_y, d_pos_z,
3656  d_velNew_x, d_velNew_y, d_velNew_z,
3657  d_posNew_x, d_posNew_y, d_posNew_z,
3658  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3659  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3660  &settleList, settleListSize, &d_consFailure,
3661  d_consFailureSize, &rattleList, rattleListSize,
3662  &nSettle, &nRattle,
3663  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3664  buildRigidLists, consFailure, simParams->watmodel, stream);
3665  buildRigidLists = false;
3666  }
3667  CUDASequencerKernel->langevinVelocitiesBBK2(
3668  dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3669  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3670  d_vel_x, d_vel_y, d_vel_z,
3671  numAtomsHome, numAtomsHome, 0,
3672  curandGen, stream);
3673  }
3674  if(simParams->rigidBonds != RIGID_NONE){
3675  CUDASequencerKernel->rattle1(
3676  simParams->fixedAtomsOn, doEnergy || doVirial,
3677  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3678  2.0 * simParams->rigidTol,
3679  d_vel_x, d_vel_y, d_vel_z,
3680  d_pos_x, d_pos_y, d_pos_z,
3681  d_velNew_x, d_velNew_y, d_velNew_z,
3682  d_posNew_x, d_posNew_y, d_posNew_z,
3683  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3684  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3685  &settleList, settleListSize, &d_consFailure,
3686  d_consFailureSize, &rattleList, rattleListSize,
3687  &nSettle, &nRattle,
3688  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3689  buildRigidLists, consFailure, simParams->watmodel, stream);
3690  buildRigidLists = false;
3691  }
3692 
3693  // Update velocity center of mass here
3694  if(doEnergy || doVirial){
3695  CUDASequencerKernel->centerOfMass(
3696  d_vel_x, d_vel_y, d_vel_z,
3697  d_vcm_x, d_vcm_y, d_vcm_z,
3698  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3699  }
3700 
3701  submitHalf(numAtomsHome, 2, doEnergy || doVirial);
3702 
3703  CUDASequencerKernel->addForceToMomentum(
3704  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3705  d_recipMass,
3706  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3707  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3708  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3709  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3710  numAtomsHome, maxForceNumber, stream);
3711 
3712  if(requestGlobalForces || requestForcesOutput) {
 3713  // store the forces for the next step,
 3714  // when we need them for colvars and Tcl scripting
3715  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3716  }
3717 
3718  if (forceRequestedGPU) {
3719  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3720  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3721  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3722  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3723  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3724  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3725  CUDASequencerKernel->copyForcesToDevice(
3726  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3727  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3728  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3729  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3730  // cudaCheck(cudaStreamSynchronize(stream));
3731  }
3732 
3733  //cudaCheck(cudaStreamSynchronize(stream));
3734  submitReductions(origin.x, origin.y, origin.z,
3735  marginViolations, doEnergy || doVirial,
3736  copyOut && simParams->outputMomenta != 0,
3737  numAtomsHome, maxForceNumber);
 3738  // This is for collecting coordinates and velocities for output
3739  copyPositionsAndVelocitiesToHost(copyOut, 0);
3740 
3741  if(consFailure[0]){
3742  // Constraint failure. Abort.
3743  int dieOnError = simParams->rigidDie;
3744  if(dieOnError){
3745  // Bails out
3746  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
3747  NAMD_die("constraint failure during CUDA rattle!\n");
3748  }else{
3749  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3750  }
3751  }else if(doEnergy || doVirial){
3752  cudaCheck(cudaStreamSynchronize(stream));
3753  if(doVirial && doGlobalStaleForces) {
3754  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3755  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3756  }
3757  if(simParams->rigidBonds != RIGID_NONE){
3758  Tensor reduction_rigidVirial;
3759  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
3760  // Haochuan: the tensor is not symmetric when there are fixed atoms
3761  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3762  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
3763  }
3764 
3765  // SUBMITHALF reductions
3766  Tensor reduction_virial;
3767  Tensor reduction_intVirialNormal;
3768  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3769  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3770  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3771  // Haochuan: the tensor is not symmetric when there are fixed atoms
3772  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3773  reduction_virial *= 0.5;
3774  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3775 
3776  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3777  += (intKineticEnergy_half[0] * 0.25);
3778  reduction_intVirialNormal *= 0.5;
3779  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3780  reduction_intVirialNormal);
3781 
3782  //submitReductions1
3783  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
3784  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3785  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3786  Vector angularMomentum(*angularMomentum_x,
3787  *angularMomentum_y,
3788  *angularMomentum_z);
3789  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3790  //submitReductions2
3791  Tensor regintVirialNormal;
3792  Tensor regintVirialNbond;
3793  Tensor regintVirialSlow;
3794  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
3795  if (maxForceNumber >= 1) {
3796  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
3797  }
3798  if (maxForceNumber >= 2) {
3799  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
3800  }
3801 
3802  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
3803  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
3804  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
3805  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
3806 
3807  if (simParams->fixedAtomsOn) {
3808  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3809  double3 fixForceNormal, fixForceNbond, fixForceSlow;
3810  switch (maxForceNumber) {
3811  case 2: {
3812  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3813  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3814  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
3815  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
3816  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
3817  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3818  } // intentionally fallthrough
3819  case 1: {
3820  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3821  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3822  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
3823  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
3824  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
3825  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3826  } // intentionally fallthrough
3827  default: {
3828  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3829  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3830  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
3831  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
3832  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
3833  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3834  }
3835  }
3836 #if 0
3837  auto printTensor = [](const cudaTensor& t, const std::string& name){
3838  CkPrintf("%s", name.c_str());
3839  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3840  "%12.5lf %12.5lf %12.5lf\n"
3841  "%12.5lf %12.5lf %12.5lf\n",
3842  t.xx, t.xy, t.xz,
3843  t.yx, t.yy, t.yz,
3844  t.zx, t.zy, t.zz);
3845  };
3846  printTensor(fixVirialNormal, "fixVirialNormal = ");
3847  printTensor(fixVirialNbond, "fixVirialNbond = ");
3848  printTensor(fixVirialSlow, "fixVirialSlow = ");
3849 #endif
3850  }
3851  }
3852 }
3853 
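The two Langevin kernels used above implement the BBK (Brunger-Brooks-Karplus) splitting of the thermostat around the deterministic kick: the velocity is damped before the force kick and the random force plus the (1 + gamma*dt/2) rescaling are applied after it. A hedged per-atom sketch; gamma stands for the per-atom langevinParam and sigma for the precomputed random scaling held in d_langScalRandBBK2, while the real kernels work on SoA arrays and draw Gaussians with cuRAND.

static void bbk1(double& v, double gamma, double dt) {
  v *= (1.0 - 0.5 * gamma * dt);                            // damp before the kick
}

static void bbk2(double& v, double gamma, double dt,
                 double sigma, double gaussian) {
  v = (v + sigma * gaussian) / (1.0 + 0.5 * gamma * dt);    // add noise, then rescale
}
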
3854 // Adding this function back temporarily until GPU migration is merged
3855 #if 1
 3856 // This function aggregates data within a single GPU, so the PME positions need to have been copied over
3857 void SequencerCUDA::atomUpdatePme()
3858 {
 3859  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
 3860      ComputeNonbondedUtil::dielectric_1);
 3861  // We need to find doNbond and doSlow for the upcoming step
3862  bool doNbond = false;
3863  bool doSlow = true;
3864 
3865  bool doFEP = false;
3866  bool doTI = false;
3867  bool doAlchDecouple = false;
3868  bool doAlchSoftCore = false;
3869  if (simParams->alchOn) {
3870  if (simParams->alchFepOn) doFEP = true;
3871  if (simParams->alchThermIntOn) doTI = true;
3872  if (simParams->alchDecouple) doAlchDecouple = true;
3873  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3874  }
3875 
3876  if (Node::Object()->molecule->is_lonepairs_psf) {
3877  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3878  }
3879 
3880  bool usePatchPme = false;
3881  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3882  // This will check if the current lattice is compatible with the patch-level PME kernels
3883  // This needs to be redone every time the lattice changes, the results are stored in
3884  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3885  // function.
3886  //
3887  // The behavior of set compute positions PME is different depending on the kernels being used,
3888  // so that value needs to be passed to the kernel object
3890  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3892  myLattice,
3893  getNumPatchesHome(),
3894  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3895  getHostPatchMin(),
3896  getHostPatchMax(),
3897  getHostAwayDists()
3898  );
3899  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3900  }
3901 
3902  std::vector<int> atom_counts;
3903  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3904  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3905  }
3906  CUDASequencerKernel->set_pme_positions(
3907  deviceIndex,
3909  nDevices,
3910  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3911  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3912 #ifdef NAMD_NCCL_ALLREDUCE
3913  (mGpuOn) ? d_posNew_x: d_pos_x,
3914  (mGpuOn) ? d_posNew_y: d_pos_y,
3915  (mGpuOn) ? d_posNew_z: d_pos_z,
3916 #else
3917  d_pos_x,
3918  d_pos_y,
3919  d_pos_z,
3920  d_peer_pos_x, // passes double-pointer if mgpuOn
3921  d_peer_pos_y,
3922  d_peer_pos_z,
3923  d_peer_charge,
3924  d_peer_partition,
3925 #endif
3926  d_charge, d_partition, charge_scaling,
3927  d_patchCenter,
3928  patchData->devData[deviceIndex].slow_patchPositions,
3929  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3930  d_sortOrder, myLattice,
3931  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3932  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3934  patchData->devData[deviceIndex].d_localPatches,
3935  patchData->devData[deviceIndex].d_peerPatches,
3936  atom_counts,
3937  stream);
3938 
3939  cudaCheck(cudaStreamSynchronize(stream));
3940 }
3941 #endif
3942 
3943 
3944 void SequencerCUDA::sync() {
3945  cudaCheck(cudaStreamSynchronize(stream));
3946 }
3947 
3948 void SequencerCUDA::calculateExternalForces(
3949  const int step,
3950  const int maxForceNumber,
3951  const int doEnergy,
3952  const int doVirial) {
3953 
3954  const bool is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3955  const bool is_tip4_water = simParams->watmodel == WaterModel::TIP4;
3956 
3957  if (is_lonepairs_psf) {
3958  lonepairsKernel->redistributeForce(
3959  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3960  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3961  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3962  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3963  d_pos_x, d_pos_y, d_pos_z, maxForceNumber, doEnergy || doVirial, stream);
3964  }
3965  // TODO: Should I use "else if" here to follow the logic of Sequencer.C?
3966  if (is_tip4_water) {
3967  redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
3968  }
3969 
3970  if(simParams->eFieldOn){
3971  double3 efield;
3972  efield.x = simParams->eField.x;
3973  efield.y = simParams->eField.y;
3974  efield.z = simParams->eField.z;
3975 
3976  double efield_omega = TWOPI * simParams->eFieldFreq / 1000.;
3977  double efield_phi = PI/180. * simParams->eFieldPhase;
3978  double t = step * simParams->dt;
3979 
3980  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3981  doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3982  d_charge, d_pos_x, d_pos_y, d_pos_z,
3983  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3984  &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3985  &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3986  &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3987  d_tbcatomic, stream);
3988  }
3989 
3990  if(simParams->constraintsOn){
3991  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3992  d_pos_x, d_pos_y, d_pos_z,
3993  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3994  &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3995  &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3996  &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3997  }
3998 
3999  if(simParams->SMDOn){
4000  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
4001  d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform,
4002  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4003  &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
4004  &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
4005  }
4006 
4007  if(simParams->groupRestraintsOn){
4008  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
4009  myLattice, d_transform,
4010  d_mass, d_pos_x, d_pos_y, d_pos_z,
4011  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4012  &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
4013  &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
4014  }
4015  if(simParams->mgridforceOn || simParams->gridforceOn){
4016  gridForceKernel->doForce(doEnergy, doVirial,
4017  myLattice, step,
4018  d_pos_x, d_pos_y, d_pos_z, d_transform,
4019  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4020  stream);
4021  }
4022  if(simParams->consForceOn){
4023  consForceKernel->doForce(myLattice, doVirial,
4024  d_pos_x, d_pos_y, d_pos_z,
4025  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4026  d_transform,
4027  &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
4028  &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
4029  }
4030 
4031  if(doEnergy || doVirial) {
4032  // Store the external forces and energy data
4033  cudaCheck(cudaStreamSynchronize(stream));
4034  if (is_lonepairs_psf || is_tip4_water) {
4035  switch (maxForceNumber) {
4036  case 2:
4037  copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
4038  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, lpVirialSlow[0]);
4039  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
4040  case 1:
4041  copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
4042  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, lpVirialNbond[0]);
4043  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
4044  case 0:
4045  copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
4046  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, lpVirialNormal[0]);
4047  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
4048  }
4049  }
4050  if(simParams->eFieldOn){
4051  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_ELEC_FIELD];
4052  if (!simParams->eFieldNormalized){
4053  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
4054  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
4055  }
4056  }
4057 
4058  if(simParams->constraintsOn){
4059  reduction->item(REDUCTION_BC_ENERGY) += extEnergy[EXT_CONSTRAINTS];
4060  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
4061  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
4062  }
4063 
4064  if(simParams->SMDOn){
4066  if(cudaMgr->reducerSMDDevice == deviceIndex)
4067  {
4068  // each SMD kernel has the total SMD energy and extForce
4069  // so only one will contribute
4070  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_SMD];
4071  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_SMD]);
4072  }
4073  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_SMD]);
4074  }
4075 
4076  if(simParams->groupRestraintsOn){
4077  // single GPU case, or designated reducer are only contributors for energy
4079  if(!mGpuOn || (deviceCUDA->getDeviceIndex() == cudaMgr->reducerGroupRestraintDevice))
4080  {
4081  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GROUP_RESTRAINTS];
4082  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
4083  }
4084  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
4085  }
4086 
4087  if(simParams->mgridforceOn || simParams->gridforceOn){
4088  // first sum across grids
4089  gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4090  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GRIDFORCE];
4091  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
4092  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
4093  gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4094  }
4095  if(simParams->consForceOn){
4096  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_CONSFORCE];
4097  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
4098  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
4099  }
4100  }
4101 }
4102 
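apply_Efield modulates the external field in time using the efield_omega and efield_phi values computed just above the call. A minimal sketch of the resulting per-atom force, assuming the cos(omega*t + phi) convention shown here; the handling of eFieldNormalized (transforming the field into lattice coordinates) and the energy/virial accumulation are omitted.

#include <cmath>

struct Double3 { double x, y, z; };

// Per-atom force from an (optionally oscillating) external electric field:
// F = q * E * cos(omega * t + phi).
static Double3 eFieldForce(double charge, const Double3& efield,
                           double omega, double phi, double t) {
  const double mod = std::cos(omega * t + phi);
  return { charge * efield.x * mod,
           charge * efield.y * mod,
           charge * efield.z * mod };
}
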
4103 void SequencerCUDA::copyGlobalForcesToDevice(){
4104  // copy the globalMaster forces from host to device.
4105  // Use normal force on host to aggregate it
4106  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
4107  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
4108  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
 4109  // TODO: determine whether this aggregation needs peer patches and should cover both home and proxy patches
4110  for(int i =0 ; i < numPatchesHome; i++){
4111  CudaLocalRecord record = localPatches[i];
4112  const int patchID = record.patchID;
4113  const int stride = record.bufferOffset;
4114  const int numPatchAtoms = record.numAtoms;
4115  PatchDataSOA& current = homePatches[i]->patchDataSOA;
4116  memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4117  memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4118  memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4119  }
4120  // copy aggregated force to device buffer
4121  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4122  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4123  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4124 
4125 }
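The aggregation pattern used here and by the AGGREGATE_* macros — walk the home patches, memcpy each patch's array into a flat host buffer at its bufferOffset, then issue one asynchronous host-to-device transfer on the sequencer stream — can be sketched in isolation as below. PatchField and the buffer names are illustrative assumptions, not NAMD types.

#include <cuda_runtime.h>
#include <cstring>
#include <vector>

// Hypothetical per-patch view of one SOA field on the host.
struct PatchField {
  const double* data;   // patch-local array
  int numAtoms;         // number of atoms in this patch
  int bufferOffset;     // where this patch starts in the flat device layout
};

// Gather the per-patch arrays into one flat (ideally pinned) host buffer,
// then issue a single asynchronous host-to-device copy on the given stream.
void aggregateAndUpload(const std::vector<PatchField>& patches,
                        double* h_flat, double* d_flat,
                        int numAtoms, cudaStream_t stream) {
  for (const PatchField& p : patches)
    std::memcpy(h_flat + p.bufferOffset, p.data, p.numAtoms * sizeof(double));
  cudaMemcpyAsync(d_flat, h_flat, numAtoms * sizeof(double),
                  cudaMemcpyHostToDevice, stream);
}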
4126 
4127 void SequencerCUDA::updateHostPatchDataSOA() {
4128  std::vector<PatchDataSOA> host_copy(numPatchesHome);
4129  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4130 
4131  for (int i = 0; i < numPatchesHome; i++) {
4132  host_copy[i] = homePatches[i]->patchDataSOA;
4133  }
4134  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4135  cudaCheck(cudaDeviceSynchronize());
4136 }
4137 
4138 void SequencerCUDA::saveForceCUDASOA_direct(
4139  const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4140  CUDASequencerKernel->copyForcesToHostSOA(
4141  numPatchesHome,
4142  patchData->devData[deviceIndex].d_localPatches,
4143  maxForceNumber,
4144  d_f_normal_x,
4145  d_f_normal_y,
4146  d_f_normal_z,
4147  d_f_nbond_x,
4148  d_f_nbond_y,
4149  d_f_nbond_z,
4150  d_f_slow_x,
4151  d_f_slow_y,
4152  d_f_slow_z,
4153  d_HostPatchDataSOA,
4154  doGlobal,
4155  doForcesOutput,
4156  stream
4157  );
4158  cudaCheck(cudaStreamSynchronize(stream));
4159 }
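saveForceCUDASOA_direct (and copyPositionsToHost_direct below) follow one pattern: a kernel scatters results through the host pointers recorded in d_HostPatchDataSOA, and the caller blocks on cudaStreamSynchronize before any host code reads them. A stand-alone sketch of that pattern with plain CUDA runtime calls is given below; the toy kernel and the mapped pinned allocation are assumptions for illustration, not the NAMD kernels or buffers.

#include <cuda_runtime.h>
#include <cstdio>

// Toy kernel: scatter results straight into a host-visible (mapped, pinned) buffer.
__global__ void writeToHost(double* hostVisible, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) hostVisible[i] = 2.0 * i;
}

int main() {
  const int n = 1024;
  double* h_buf = nullptr;
  // Pinned, mapped allocation: the device can address it directly (zero-copy).
  cudaHostAlloc((void**)&h_buf, n * sizeof(double), cudaHostAllocMapped);
  double* d_view = nullptr;
  cudaHostGetDevicePointer((void**)&d_view, h_buf, 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  writeToHost<<<(n + 127) / 128, 128, 0, stream>>>(d_view, n);
  // The host must not read h_buf until the stream has drained.
  cudaStreamSynchronize(stream);
  printf("h_buf[10] = %.1f\n", h_buf[10]);  // 20.0

  cudaStreamDestroy(stream);
  cudaFreeHost(h_buf);
  return 0;
}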
4160 
4161 void SequencerCUDA::copyPositionsToHost_direct() {
4162  CUDASequencerKernel->copyPositionsToHostSOA(
4163  numPatchesHome,
4164  patchData->devData[deviceIndex].d_localPatches,
4165  d_pos_x,
4166  d_pos_y,
4167  d_pos_z,
4168  d_HostPatchDataSOA,
4169  stream
4170  );
4171  cudaCheck(cudaStreamSynchronize(stream));
4172 }
4173 
4174 void SequencerCUDA::redistributeTip4pForces(
4175  const int maxForceNumber,
4176  const int doVirial) {
4177  CUDASequencerKernel->redistributeTip4pForces(
4178  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4179  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
4180  d_f_slow_x, d_f_slow_y, d_f_slow_z,
4181  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4182  d_pos_x, d_pos_y, d_pos_z, d_mass,
4183  numAtomsHome, doVirial, maxForceNumber, stream
4184  );
4185 }
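Because the TIP4P lone pair carries no mass, the force the kernels accumulate on it must be transferred to the oxygen and hydrogens of its water before integration, with the matching virial corrections accumulated in d_lpVirialNormal/Nbond/Slow when doVirial is set. The simplest case, a site defined as a fixed linear combination of its parent positions, reduces by the chain rule to weighted force transfers; the sketch below illustrates only that case, with a hypothetical Vec3 type and weights, and is not the geometry actually handled inside redistributeTip4pForces.

#include <array>
#include <cstdio>

struct Vec3 { double x, y, z; };

// For a massless site at r_lp = w0*r_O + w1*r_H1 + w2*r_H2 (fixed weights),
// the chain rule transfers its accumulated force as F_parent += w_parent * F_lp.
void redistributeLonepairForce(const Vec3& f_lp, const std::array<double, 3>& w,
                               Vec3& f_O, Vec3& f_H1, Vec3& f_H2) {
  Vec3* parent[3] = { &f_O, &f_H1, &f_H2 };
  for (int p = 0; p < 3; ++p) {
    parent[p]->x += w[p] * f_lp.x;
    parent[p]->y += w[p] * f_lp.y;
    parent[p]->z += w[p] * f_lp.z;
  }
}

int main() {
  Vec3 fO{}, fH1{}, fH2{}, fLP{1.0, 2.0, 3.0};
  redistributeLonepairForce(fLP, {0.8, 0.1, 0.1}, fO, fH1, fH2);  // hypothetical weights
  printf("fO.x=%.2f fH1.x=%.2f fH2.x=%.2f\n", fO.x, fH1.x, fH2.x);  // 0.80 0.10 0.10
  return 0;
}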
4186 
4187 void SequencerCUDA::allocateGPUSavedForces() {
4188  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4189  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4190  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4191  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4192  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4193  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4194 }
4195 
4196 void SequencerCUDA::submitReductionValues() {
4197  reduction->submit();
4198 }
4199 
4200 #endif // NODEGROUP_FORCE_REGISTER