NAMD
SequencerCUDA.C
1 #include "ComputeLonepairsCUDA.h"
2 #include "CudaUtils.h"
3 #include "ReductionMgr.h"
4 #include "SequencerCUDA.h"
5 #include "ComputeNonbondedUtil.h"
6 #include "DeviceCUDA.h"
7 #include "TestArray.h"
9 #include "ComputeGridForceCUDA.h"
10 #include "ComputeConsForceCUDA.h"
11 #include "NamdEventsProfiling.h"
12 #include "AtomMap.h"
13 #include "common.h"
14 #include <algorithm> // std::fill()
16 //#define DEBUGM
17 //#define MIN_DEBUG_LEVEL 3
18 #ifdef NODEGROUP_FORCE_REGISTER
19 extern __thread DeviceCUDA *deviceCUDA;
20 
21 #if 1
22 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
23  size_t offset = 0; \
24  for (int i = 0; i < numPatchesHome; i++) { \
25  PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
26  const int numPatchAtoms = current.numAtoms; \
27  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
28  offset += numPatchAtoms; \
29  } \
30  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
31  } while(0);
32 
33 #define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
34  size_t offset = 0; \
35  for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
36  PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
37  const int numPatchAtoms = current.numAtoms; \
38  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
39  offset += numPatchAtoms; \
40  } \
41  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
42  } while(0);
43 
44 #else
45 #define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
46  size_t offset = 0; \
47  for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
48  PatchDataSOA& current = elem->patch->patchDataSOA; \
49  const int numPatchAtoms = current.numAtoms; \
50  memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
51  offset += numPatchAtoms; \
52  } \
53  copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
54  } while(0);
55 #endif
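// Usage sketch (illustrative only): each aggregation macro gathers one per-patch
// SOA field into the contiguous pinned host buffer and then issues a single
// host-to-device copy on the given stream. For example,
//   AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
// expands to a loop over the home patches that memcpy's each patch's recipMass
// array into recipMass + offset, followed by
//   copy_HtoD<double>(recipMass, d_recipMass, numAtomsHome, stream);
// (the d_ prefix comes from the d_ ## fieldName token pasting above).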
56 
57 // Publishes this device's SOA device pointers into the host-side patchData registry (indexed by deviceIndex)
58 void SequencerCUDA::registerSOAPointersToHost(){
59  // fprintf(stderr, "Dev[%d] setting memories\n", deviceID);
60  patchData->h_soa_pos_x[deviceIndex] = this->d_pos_x;
61  patchData->h_soa_pos_y[deviceIndex] = this->d_pos_y;
62  patchData->h_soa_pos_z[deviceIndex] = this->d_pos_z;
63 
64  patchData->h_soa_vel_x[deviceIndex] = this->d_vel_x;
65  patchData->h_soa_vel_y[deviceIndex] = this->d_vel_y;
66  patchData->h_soa_vel_z[deviceIndex] = this->d_vel_z;
67 
68  patchData->h_soa_fb_x[deviceIndex] = this->d_f_normal_x;
69  patchData->h_soa_fb_y[deviceIndex] = this->d_f_normal_y;
70  patchData->h_soa_fb_z[deviceIndex] = this->d_f_normal_z;
71 
72  patchData->h_soa_fn_x[deviceIndex] = this->d_f_nbond_x;
73  patchData->h_soa_fn_y[deviceIndex] = this->d_f_nbond_y;
74  patchData->h_soa_fn_z[deviceIndex] = this->d_f_nbond_z;
75 
76  patchData->h_soa_fs_x[deviceIndex] = this->d_f_slow_x;
77  patchData->h_soa_fs_y[deviceIndex] = this->d_f_slow_y;
78  patchData->h_soa_fs_z[deviceIndex] = this->d_f_slow_z;
79 
80  patchData->h_soa_id[deviceIndex] = this->d_idMig;
81  patchData->h_soa_vdwType[deviceIndex] = this->d_vdwType;
82  patchData->h_soa_sortOrder[deviceIndex] = this->d_sortOrder;
83  patchData->h_soa_unsortOrder[deviceIndex] = this->d_unsortOrder;
84  patchData->h_soa_charge[deviceIndex] = this->d_charge;
85  patchData->h_soa_patchCenter[deviceIndex] = this->d_patchCenter;
86  patchData->h_soa_migrationDestination[deviceIndex] = this->d_migrationDestination;
87  patchData->h_soa_sortSoluteIndex[deviceIndex] = this->d_sortSoluteIndex;
88 
89  patchData->h_soa_partition[deviceIndex] = this->d_partition;
90 
91  patchData->h_atomdata_AoS[deviceIndex] = (FullAtom*) this->d_atomdata_AoS_in;
92  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
93 }
94 
95 void SequencerCUDA::copySOAHostRegisterToDevice(){
96  // Copies the host-side registry of per-device SOA pointers (filled by registerSOAPointersToHost) to this device
97  // NOTE: This needs to be called only when ALL masterPEs have safely called ::registerSOAPointersToHost()
98  cudaCheck(cudaSetDevice(deviceID));
99  copy_HtoD<double*>(patchData->h_soa_pos_x, this->d_peer_pos_x, nDevices, stream);
100  copy_HtoD<double*>(patchData->h_soa_pos_y, this->d_peer_pos_y, nDevices, stream);
101  copy_HtoD<double*>(patchData->h_soa_pos_z, this->d_peer_pos_z, nDevices, stream);
102 
103  copy_HtoD<float*>(patchData->h_soa_charge, this->d_peer_charge, nDevices, stream);
104  if (simParams->alchOn) {
105  copy_HtoD<int*>(patchData->h_soa_partition, this->d_peer_partition, nDevices, stream);
106  }
107 
108  copy_HtoD<double*>(patchData->h_soa_vel_x, this->d_peer_vel_x, nDevices, stream);
109  copy_HtoD<double*>(patchData->h_soa_vel_y, this->d_peer_vel_y, nDevices, stream);
110  copy_HtoD<double*>(patchData->h_soa_vel_z, this->d_peer_vel_z, nDevices, stream);
111 
112  copy_HtoD<double*>(patchData->h_soa_fb_x, this->d_peer_fb_x, nDevices, stream);
113  copy_HtoD<double*>(patchData->h_soa_fb_y, this->d_peer_fb_y, nDevices, stream);
114  copy_HtoD<double*>(patchData->h_soa_fb_z, this->d_peer_fb_z, nDevices, stream);
115 
116  copy_HtoD<double*>(patchData->h_soa_fn_x, this->d_peer_fn_x, nDevices, stream);
117  copy_HtoD<double*>(patchData->h_soa_fn_y, this->d_peer_fn_y, nDevices, stream);
118  copy_HtoD<double*>(patchData->h_soa_fn_z, this->d_peer_fn_z, nDevices, stream);
119 
120  copy_HtoD<double*>(patchData->h_soa_fs_x, this->d_peer_fs_x, nDevices, stream);
121  copy_HtoD<double*>(patchData->h_soa_fs_y, this->d_peer_fs_y, nDevices, stream);
122  copy_HtoD<double*>(patchData->h_soa_fs_z, this->d_peer_fs_z, nDevices, stream);
123 
124  copy_HtoD<int4*>(patchData->h_soa_migrationDestination, this->d_peer_migrationDestination, nDevices, stream);
125  copy_HtoD<int*>(patchData->h_soa_sortSoluteIndex, this->d_peer_sortSoluteIndex, nDevices, stream);
126 
127  copy_HtoD<int*>(patchData->h_soa_id, this->d_peer_id, nDevices, stream);
128  copy_HtoD<int*>(patchData->h_soa_vdwType, this->d_peer_vdwType, nDevices, stream);
129  copy_HtoD<int*>(patchData->h_soa_sortOrder, this->d_peer_sortOrder, nDevices, stream);
130  copy_HtoD<int*>(patchData->h_soa_unsortOrder, this->d_peer_unsortOrder, nDevices, stream);
131  copy_HtoD<double3*>(patchData->h_soa_patchCenter, this->d_peer_patchCenter, nDevices, stream);
132 
133  copy_HtoD<FullAtom*>(patchData->h_atomdata_AoS, this->d_peer_atomdata, nDevices, stream);
134  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);
135 
136  // aggregate device pointers
137  for(int i = 0; i < this->nDevices; i++)
138  h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
139  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
140 }
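// Typical setup ordering (a sketch; the actual orchestration lives in the calling
// Sequencer/node-group code, per the NOTE above): every device's master PE first
// publishes its pointers, all master PEs synchronize, and only then is the full
// peer-pointer table copied to each GPU, e.g.
//   seq->registerSOAPointersToHost();
//   /* ... node-group synchronization across master PEs ... */
//   seq->copySOAHostRegisterToDevice();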
141 
142 void SequencerCUDA::printSOAPositionsAndVelocities() {
143  BigReal* h_pos_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
144  BigReal* h_pos_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
145  BigReal* h_pos_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
146 
147  BigReal* h_vel_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
148  BigReal* h_vel_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
149  BigReal* h_vel_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
150 
151  // DMC thinks this condition was a holdover from the NCCL code path
152  if(false && mGpuOn){
153  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
154  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
155  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
156  }else{
157  copy_DtoH_sync<BigReal>(d_pos_x, h_pos_x, numAtomsHome);
158  copy_DtoH_sync<BigReal>(d_pos_y, h_pos_y, numAtomsHome);
159  copy_DtoH_sync<BigReal>(d_pos_z, h_pos_z, numAtomsHome);
160  }
161 
162  copy_DtoH_sync<BigReal>(d_vel_x, h_vel_x, numAtomsHome);
163  copy_DtoH_sync<BigReal>(d_vel_y, h_vel_y, numAtomsHome);
164  copy_DtoH_sync<BigReal>(d_vel_z, h_vel_z, numAtomsHome);
165 
166  CmiLock(this->patchData->printlock);
167  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
168  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
169  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
170  for(int i =0 ; i < numPatchesHome; i++){
171  CudaLocalRecord record = localPatches[i];
172  const int patchID = record.patchID;
173  const int stride = record.bufferOffset;
174  const int numPatchAtoms = record.numAtoms;
175  PatchDataSOA& current = homePatches[i]->patchDataSOA;
176 
177  fprintf(stderr, "Patch [%d]:\n", patchID);
178  for(int j = 0; j < numPatchAtoms; j++){
179  fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
180  h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
181  h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
182  }
183 
184  }
185  CmiUnlock(this->patchData->printlock);
186 
187  free(h_pos_x);
188  free(h_pos_y);
189  free(h_pos_z);
190  free(h_vel_x);
191  free(h_vel_y);
192  free(h_vel_z);
193 }
194 
195 void SequencerCUDA::printSOAForces(char *prefix) {
196  BigReal* h_f_normal_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
197  BigReal* h_f_normal_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
198  BigReal* h_f_normal_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
199 
200  BigReal* h_f_nbond_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
201  BigReal* h_f_nbond_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
202  BigReal* h_f_nbond_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
203 
204  BigReal* h_f_slow_x = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
205  BigReal* h_f_slow_y = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
206  BigReal* h_f_slow_z = (BigReal*)malloc(sizeof(BigReal)*numAtomsHome);
207 
208  copy_DtoH_sync<BigReal>(d_f_normal_x, h_f_normal_x, numAtomsHome);
209  copy_DtoH_sync<BigReal>(d_f_normal_y, h_f_normal_y, numAtomsHome);
210  copy_DtoH_sync<BigReal>(d_f_normal_z, h_f_normal_z, numAtomsHome);
211 
212  copy_DtoH_sync<BigReal>(d_f_nbond_x, h_f_nbond_x, numAtomsHome);
213  copy_DtoH_sync<BigReal>(d_f_nbond_y, h_f_nbond_y, numAtomsHome);
214  copy_DtoH_sync<BigReal>(d_f_nbond_z, h_f_nbond_z, numAtomsHome);
215 
216  copy_DtoH_sync<BigReal>(d_f_slow_x, h_f_slow_x, numAtomsHome);
217  copy_DtoH_sync<BigReal>(d_f_slow_y, h_f_slow_y, numAtomsHome);
218  copy_DtoH_sync<BigReal>(d_f_slow_z, h_f_slow_z, numAtomsHome);
219 
220  // Now print the forces patch by patch, serialized by the print lock
221  CmiLock(this->patchData->printlock);
222  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
223  char fname[100];
224  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
225  for(int i =0 ; i < numPatchesHome; i++){
226  CudaLocalRecord record = localPatches[i];
227  const int patchID = record.patchID;
228  const int stride = record.bufferOffset;
229  const int numPatchAtoms = record.numAtoms;
230  FILE *outfile=stderr;
231  if(prefix!=NULL)
232  {
233  snprintf(fname,100, "%s-patch-%d", prefix, patchID);
234  outfile = fopen(fname, "w");
235  }
236  fprintf(outfile, "Patch [%d]:\n", patchID);
237  for(int j = 0; j < numPatchAtoms; j++){
238  fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
239  h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
240  h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
241  h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j] );
242  }
243  if(prefix!=NULL) fclose(outfile);
244  }
245 
246  CmiUnlock(this->patchData->printlock);
247 
248  free(h_f_normal_x);
249  free(h_f_normal_y);
250  free(h_f_normal_z);
251 
252  free(h_f_nbond_x);
253  free(h_f_nbond_y);
254  free(h_f_nbond_z);
255 
256  free(h_f_slow_x);
257  free(h_f_slow_y);
258  free(h_f_slow_z);
259 
260 }
261 
262 SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
263  SimParameters *const sim_Params) {
264  if (CkpvAccess(SequencerCUDA_instance) == 0) {
265  CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
266  }
267  return CkpvAccess(SequencerCUDA_instance);
268 }
269 
270 SequencerCUDA::SequencerCUDA(const int deviceID_ID,
271  SimParameters *const sim_Params):
272  deviceID(deviceID_ID), simParams(sim_Params)
273 {
274  restraintsKernel = NULL;
275  SMDKernel = NULL;
276  groupRestraintsKernel = NULL;
277  gridForceKernel = NULL;
278  consForceKernel = NULL;
279  lonepairsKernel = nullptr;
280 #if 1
281  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
282  patchData = cpdata.ckLocalBranch();
283 #endif
284  initialize();
285  CUDASequencerKernel = new SequencerCUDAKernel();
286  CUDAMigrationKernel = new MigrationCUDAKernel();
287  num_used_grids = simParams->alchGetNumOfPMEGrids();
288  used_grids.resize(num_used_grids, 0);
289  if (simParams->alchFepOn) {
290  // at least two grids are used
291  used_grids[0] = 0;
292  used_grids[1] = 1;
293  // if alchDecouple then two more grids are used
294  if (simParams->alchDecouple) {
295  used_grids[2] = 2;
296  used_grids[3] = 3;
297  // an extra for soft-core potential
298  if (simParams->alchElecLambdaStart > 0) {
299  used_grids[4] = 4;
300  }
301  } else {
302  // in this case alchDecouple is false
303  // but if there is still soft-core potential
304  // then a total of 3 grids are used
305  // mark the last grid for soft-core potential
306  if (simParams->alchElecLambdaStart > 0) {
307  used_grids[2] = 4;
308  }
309  }
310  }
311  if (simParams->alchThermIntOn) {
312  used_grids[0] = 0;
313  used_grids[1] = 1;
314  // in TI, no matter whether soft-core potential is used
315  // it has at least three grids
316  if (simParams->alchDecouple) {
317  used_grids[2] = 2;
318  used_grids[3] = 3;
319  used_grids[4] = 4;
320  } else {
321  used_grids[2] = 4;
322  }
323  }
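 // Worked example of the mapping above: with alchFepOn, alchDecouple off, and
 // alchElecLambdaStart > 0, three grids are used and used_grids = {0, 1, 4};
 // with alchThermIntOn and alchDecouple on, used_grids = {0, 1, 2, 3, 4}.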
324 }
325 
326 SequencerCUDA::~SequencerCUDA(){
327  cudaCheck(cudaSetDevice(deviceID));
328  deallocateArrays();
329  deallocateStaticArrays();
330  deallocate_device<SettleParameters>(&sp);
331  deallocate_device<int>(&settleList);
332  deallocate_device<CudaRattleElem>(&rattleList);
333  deallocate_device<int>(&d_consFailure);
334  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
335  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
336  if (restraintsKernel != NULL) delete restraintsKernel;
337  if(SMDKernel != NULL) delete SMDKernel;
338  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
339  if (gridForceKernel != NULL) delete gridForceKernel;
340  if (lonepairsKernel != nullptr) delete lonepairsKernel;
341  if (consForceKernel != NULL) delete consForceKernel;
342 #if 0
343  cudaCheck(cudaStreamDestroy(stream));
344 #endif
345  cudaCheck(cudaStreamDestroy(stream2));
346  curandCheck(curandDestroyGenerator(curandGen));
347  CmiDestroyLock(printlock);
348 
349 }
350 
351 void SequencerCUDA::zeroScalars(){
352  numAtomsHomeAndProxyAllocated = 0;
353  numAtomsHomeAllocated = 0;
354  buildRigidLists = true;
355  numPatchesCheckedIn = 0;
356  numPatchesReady= 0;
357 }
358 
359 void SequencerCUDA::initialize(){
360  cudaCheck(cudaSetDevice(deviceID));
361  nDevices = deviceCUDA->getNumDevice() + 1 * deviceCUDA->isGpuReservedPme();
362  deviceIndex = deviceCUDA->getDeviceIndex();
363 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
364  int leastPriority, greatestPriority;
365  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
366 #if 0
367  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
368 #else
369  stream = (CkMyPe() == deviceCUDA->getMasterPe()) ? patchData->devData[deviceIndex].nbond_stream : 0;
370 #endif
371  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
372 #else
373  cudaCheck(cudaStreamCreate(&stream));
374  cudaCheck(cudaStreamCreate(&stream2));
375 #endif
376  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
377  // each PE's seed needs to be different
378  unsigned long long seed = simParams->randomSeed + CkMyPe();
379  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));
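 // e.g., with randomSeed = 12345, PE 0 seeds the generator with 12345 and PE 7
 // with 12352, so each PE draws a different pseudorandom sequence.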
380 
381  numAtomsHomeAllocated = 0;
382  numAtomsHomeAndProxyAllocated = 0;
383 
384  totalMarginViolations = 0;
385  buildRigidLists = true;
386  numPatchesCheckedIn = 0;
387  numPatchesReady= 0;
388  PatchMap* patchMap = PatchMap::Object();
389 #if 1
390  numPatchesGlobal = patchMap->numPatches();
391 #else
392  numPatchesGlobal = patchMap->homePatchList()->size();
393 #endif
394  mGpuOn = nDevices > 1;
395  // Device work-queue allocation and registration in patchData (currently disabled):
396  // allocate_device<unsigned int>(&deviceQueue, nDevices);
397  // cudaCheck(cudaMemset(deviceQueue, 999, sizeof(unsigned int)* nDevices));
398  // Allocates and registers the local queue in patchData
399  // patchData->d_queues[deviceIndex] = deviceQueue;
400 
401  printlock = CmiCreateLock();
402 
403  const int numPes = CkNumPes();
404  if (!mGpuOn) {
405  atomMapList.resize(numPes);
406  }
407 
408  //Allocates SOA registers
409  allocate_device<double*>(&d_peer_pos_x, nDevices);
410  allocate_device<double*>(&d_peer_pos_y, nDevices);
411  allocate_device<double*>(&d_peer_pos_z, nDevices);
412  allocate_device<float*>(&d_peer_charge, nDevices);
413  if (simParams->alchOn) {
414  allocate_device<int*>(&d_peer_partition, nDevices);
415  }
416  allocate_device<double*>(&d_peer_vel_x, nDevices);
417  allocate_device<double*>(&d_peer_vel_y, nDevices);
418  allocate_device<double*>(&d_peer_vel_z, nDevices);
419 
420  if (simParams->fixedAtomsOn) {
421  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
422  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
423  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
424  allocate_device<double3>(&d_fixForceNormal, 1);
425  allocate_device<double3>(&d_fixForceNbond, 1);
426  allocate_device<double3>(&d_fixForceSlow, 1);
427  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
428  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
429  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
430  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
431  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
432  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
433  }
434 
435  allocate_device<double*>(&d_peer_fb_x, nDevices);
436  allocate_device<double*>(&d_peer_fb_y, nDevices);
437  allocate_device<double*>(&d_peer_fb_z, nDevices);
438  allocate_device<double*>(&d_peer_fn_x, nDevices);
439  allocate_device<double*>(&d_peer_fn_y, nDevices);
440  allocate_device<double*>(&d_peer_fn_z, nDevices);
441  allocate_device<double*>(&d_peer_fs_x, nDevices);
442  allocate_device<double*>(&d_peer_fs_y, nDevices);
443  allocate_device<double*>(&d_peer_fs_z, nDevices);
444 
445  allocate_device<int4*>(&d_peer_migrationDestination, nDevices);
446  allocate_device<int*>(&d_peer_sortSoluteIndex, nDevices);
447  allocate_device<int*>(&d_peer_id, nDevices);
448  allocate_device<int*>(&d_peer_vdwType, nDevices);
449  allocate_device<int*>(&d_peer_sortOrder, nDevices);
450  allocate_device<int*>(&d_peer_unsortOrder, nDevices);
451  allocate_device<double3*>(&d_peer_patchCenter, nDevices);
452 
453  allocate_device<FullAtom*>(&d_peer_atomdata, nDevices);
454  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);
455 
456  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
457 
458  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
459  // Patch-related host datastructures
460  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
461  allocate_host<double3>(&patchCenter, numPatchesGlobal);
462  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
463  allocate_host<int>(&patchToDeviceMap,numPatchesGlobal);
464  allocate_host<double3>(&awayDists, numPatchesGlobal);
465  allocate_host<double3>(&patchMin, numPatchesGlobal);
466  allocate_host<double3>(&patchMax, numPatchesGlobal);
467  //allocate_host<Lattice>(&lattices, numPatchesGlobal);
468  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal); // only needed for langevin
469  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
470  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
471  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);
472 
473  //Patch-related device datastructures
474  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
475  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
476  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
477  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
478  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
479  allocate_device<double3>(&d_patchCenter, numPatchesGlobal);
480  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
481  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
482  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
483  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
484  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
485 
486  // Allocate device memory for scalar reduction variables
487  allocate_device<int>(&d_killme, 1);
488  allocate_device<char>(&d_barrierFlag, 1);
489  allocate_device<unsigned int>(&d_tbcatomic, 5);
490  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
491  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
492  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
493  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
494  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
495  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
496  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
497  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
498  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
499  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
500  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
501  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
502  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);
503  // for lone pairs
504  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
505  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
506  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);
507  //space for globalmaster forces on device
508  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
509  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
510  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);
511 
512  allocate_device<SettleParameters>(&sp, 1);
513 
514  //allocates host scalars in host-mapped, pinned memory
515  allocate_host<int>(&killme, 1);
516  allocate_host<BigReal>(&kineticEnergy, 1);
517  allocate_host<BigReal>(&intKineticEnergy, 1);
518  allocate_host<BigReal>(&kineticEnergy_half, 1);
519  allocate_host<BigReal>(&intKineticEnergy_half, 1);
520  allocate_host<BigReal>(&momentum_x, 1);
521  allocate_host<BigReal>(&momentum_y, 1);
522  allocate_host<BigReal>(&momentum_z, 1);
523  allocate_host<BigReal>(&angularMomentum_x, 1);
524  allocate_host<BigReal>(&angularMomentum_y, 1);
525  allocate_host<BigReal>(&angularMomentum_z, 1);
526  allocate_host<int>(&consFailure, 1);
527  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
528  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
529  allocate_host<unsigned int>(&h_marginViolations, 1);
530  allocate_host<unsigned int>(&h_periodicCellSmall, 1);
531 
532  // allocates host cudaTensors in host-mapped, pinned memory
533  allocate_host<cudaTensor>(&virial, 1);
534  allocate_host<cudaTensor>(&virial_half, 1);
535  allocate_host<cudaTensor>(&intVirialNormal, 1);
536  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
537  allocate_host<cudaTensor>(&intVirialNbond, 1);
538  allocate_host<cudaTensor>(&intVirialSlow, 1);
539  allocate_host<cudaTensor>(&rigidVirial, 1);
540  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
541  allocate_host<cudaTensor>(&lpVirialNormal, 1);
542  allocate_host<cudaTensor>(&lpVirialNbond, 1);
543  allocate_host<cudaTensor>(&lpVirialSlow, 1);
544 
545  // These arrays will get allocated when total forces of the GPU global calculations are requested
546  d_f_saved_nbond_x = nullptr;
547  d_f_saved_nbond_y = nullptr;
548  d_f_saved_nbond_z = nullptr;
549  d_f_saved_slow_x = nullptr;
550  d_f_saved_slow_y = nullptr;
551  d_f_saved_slow_z = nullptr;
552 
553  //Sets values for scalars
554  *kineticEnergy = 0.0;
555  *intKineticEnergy = 0.0;
556  *kineticEnergy_half = 0.0;
557  *intKineticEnergy_half = 0.0;
558  *momentum_x = 0.0;
559  *momentum_y = 0.0;
560  *momentum_z = 0.0;
561  *angularMomentum_x = 0.0;
562  *angularMomentum_y = 0.0;
563  *angularMomentum_z = 0.0;
564  *consFailure = 0;
565  *killme = 0;
566 
567  // JM: Basic infrastructure to time kernels
568  // XXX TODO: Add timing functions to be switched on/off for each of these
569  t_total = 0;
570  t_vverlet = 0;
571  t_pairlistCheck = 0;
572  t_setComputePositions = 0;
573  t_accumulateForceKick = 0;
574  t_rattle = 0;
575  t_submitHalf = 0;
576  t_submitReductions1 = 0;
577  t_submitReductions2 = 0;
578 
579  cudaEventCreate(&eventStart);
580  cudaEventCreate(&eventStop);
581  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
582  cudaCheck(cudaMemset(d_kineticEnergy, 0, sizeof(BigReal)));
583  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
584  cudaCheck(cudaMemset(d_momentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
585  cudaCheck(cudaMemset(d_momentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
586  cudaCheck(cudaMemset(d_momentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
587  cudaCheck(cudaMemset(d_angularMomentum_x, 0, ATOMIC_BINS * sizeof(BigReal)));
588  cudaCheck(cudaMemset(d_angularMomentum_y, 0, ATOMIC_BINS * sizeof(BigReal)));
589  cudaCheck(cudaMemset(d_angularMomentum_z, 0, ATOMIC_BINS * sizeof(BigReal)));
590  cudaCheck(cudaMemset(d_intKineticEnergy, 0, ATOMIC_BINS * sizeof(BigReal)));
591  cudaCheck(cudaMemset(d_virial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
592  cudaCheck(cudaMemset(d_rigidVirial, 0, ATOMIC_BINS * sizeof(cudaTensor)));
593  cudaCheck(cudaMemset(d_intVirialNormal, 0, ATOMIC_BINS * sizeof(cudaTensor)));
594  cudaCheck(cudaMemset(d_intVirialNbond, 0, ATOMIC_BINS * sizeof(cudaTensor)));
595  cudaCheck(cudaMemset(d_intVirialSlow, 0, ATOMIC_BINS * sizeof(cudaTensor)));
596  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
597  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
598  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
599  cudaCheck(cudaMemset(d_extVirial, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(cudaTensor)));
600  cudaCheck(cudaMemset(d_extForce, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double3)));
601  cudaCheck(cudaMemset(d_extEnergy, 0, ATOMIC_BINS * EXT_FORCE_TOTAL * sizeof(double)));
602 
603  memset(h_marginViolations, 0, sizeof(unsigned int));
604  memset(h_periodicCellSmall, 0, sizeof(unsigned int));
605  memset(virial, 0, sizeof(cudaTensor));
606  memset(rigidVirial, 0, sizeof(cudaTensor));
607  memset(intVirialNormal, 0, sizeof(cudaTensor));
608  memset(intVirialNbond, 0, sizeof(cudaTensor));
609  memset(intVirialSlow, 0, sizeof(cudaTensor));
610  memset(lpVirialNormal, 0, sizeof(cudaTensor));
611  memset(lpVirialNbond, 0, sizeof(cudaTensor));
612  memset(lpVirialSlow, 0, sizeof(cudaTensor));
613  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
614 
615  settleList = NULL;
616  settleListSize = 0;
617  rattleList = NULL;
618  rattleListSize = 0;
619  d_consFailure = NULL;
620  d_consFailureSize = 0;
621 
622  // JM: I can bundle the CompAtom and CudaAtomList pointers from the
623  // patches here and fill the Host-arrays after integration. :]
624  // if single-gpu simulation, numPatchesHome is the same as numPatchesGlobal
625  numPatchesHome = numPatchesGlobal;
626 
627 
628  int count = 0;
629  // Single-GPU case
630  if(!mGpuOn){
631  // JM NOTE: This works for a single device only, but it's fast if we want to have multiple PEs sharing it
632  if(deviceCUDA->getMasterPe() == CkMyPe()){
633  for(int i = 0; i < numPes; i++) {
634  atomMapList[i] = AtomMap::ObjectOnPe(i);
635  PatchMap* patchMap = PatchMap::ObjectOnPe(i);
636  int npatch = patchMap->numPatches();
637  for (int j = 0; j < npatch; j++) {
638  HomePatch *patch = patchMap->homePatch(j);
639  // JM NOTE: This data structure can be preserved through migration steps
640  if(patch != NULL) {
641  patchList.push_back(patch);
642  patchNewTolerance[count++] =
643  0.5 * ( simParams->pairlistDist - simParams->cutoff );
644  numAtomsHomeAndProxy += patchMap->patch(j)->getNumAtoms();
645 
646  patchData->devData[deviceIndex].patches.push_back(patch);
647  patchListHomeAndProxy.push_back(patch);
648  }
649  }
650  }
651  }
652  }else{
653  // Multi-device case
654  /* The logic here is a bit trickier than in the single-GPU case.
655  Each GPU allocates space only for its home patches and any required proxy patches.
656  This rules out using an NCCL-based all-reduce for communication,
657  but DMC thinks this is okay...
658  */
659  if(deviceCUDA->getMasterPe() == CkMyPe()){
660  // Haochuan: For multi-GPU cases, we need to find the atom maps from
661  // all "non-master PEs" that use the same device as the master PE.
662  // 1. Get the current device ID.
663  const int currentDevice = deviceCUDA->getDeviceIDforPe(CkMyPe());
664  // 2. Iterate over all PEs and find those that have the same device ID as this master PE
665  for (int i = 0; i < numPes; ++i) {
666  if (deviceCUDA->getDeviceIDforPe(i) == currentDevice) {
667  atomMapList.push_back(AtomMap::ObjectOnPe(i));
668  }
669  }
670  for(int i = 0; i < deviceCUDA->getNumPesSharingDevice(); i++){
672  for (int j = 0; j < pm->numPatches(); j++) {
673  // Aggregates the patches in a separate data structure for now
674  HomePatch *patch = pm->homePatch(j);
675  if(patch != NULL) {
676  patchData->devData[deviceIndex].patches.push_back(patch);
677  }
678  }
679  }
680 
681  // if multi-GPU mode is on, we also try to set up peer access
683 #ifdef NAMD_NCCL_ALLREDUCE
684  deviceCUDA->setNcclUniqueId(patchData->ncclId);
685  deviceCUDA->setupNcclComm();
686 #endif
687 
688  }
689  }
690  PatchMap *pm = PatchMap::Object();
691  isPeriodic = (pm->periodic_a() && pm->periodic_b() && pm->periodic_c());
692 
693  // XXX TODO decide how the biasing methods will work in the future -
694  // for multiple devices this will be a problem, how to solve it?
695  if (simParams->constraintsOn) {
696  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
697  stream, mGpuOn);
698  }
699  if (simParams->SMDOn) {
700  SMDKernel = new ComputeSMDCUDA(patchList, simParams->SMDk, simParams->SMDk2,
701  simParams->SMDVel, make_double3(simParams->SMDDir.x, simParams->SMDDir.y, simParams->SMDDir.z),
702  simParams->SMDOutputFreq, simParams->firstTimestep, simParams->SMDFile,
704  }
705  if (simParams->groupRestraintsOn) {
706  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
707  simParams->groupRestraints);
708  }
709  if (simParams->mgridforceOn || simParams->gridforceOn ){
710  if(deviceCUDA->getMasterPe() == CkMyPe()){
711  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
712  }
713  }
714  if (Node::Object()->molecule->is_lonepairs_psf) {
715  lonepairsKernel = new ComputeLonepairsCUDA();
716  }
717  if (simParams->consForceOn){
718  if(deviceCUDA->getMasterPe() == CkMyPe()){
719  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,mGpuOn);
720  }
721  }
722 
723 }
724 
725 /* Events like ScriptTcl changes to global data require us to rebuild
726  some of the data structures used by some of the kernels; e.g., the
727  gridForceKernel needs to change when the grid, or grid-related
728  data, is changed.
729 
730  Technically these may all be in use and updated at different
731  intervals, but that is a weird edge case that doesn't justify
732  adding a whole other sync path. If that happens, we'll be doing
733  this slightly too often. Such a hypothetical user has already
734  bought in to periodically sacrificing performance to radically
735  change the system being simulated at run time in several
736  different ways on different triggers, so as long as this overhead
737  is substantially better than a full restart, it probably doesn't
738  matter.
739  */
740 
741 void SequencerCUDA::updateDeviceKernels()
742 {
743  if (simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn){
744  // clean out the old and bring in the new if we're a master PE
745  if(deviceCUDA->getMasterPe() == CkMyPe()){
746  if(patchData->updateCounter.fetch_sub(1)>=1)
747  {
748  if(gridForceKernel!=NULL)
749  {
750  delete gridForceKernel;
751  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
752  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
753  }
754  if(consForceKernel!=NULL)
755  {
756  delete consForceKernel;
757  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList,
758  mGpuOn);
759  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
760  }
761  }
762  }
763  }
764 }
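// Note on the counter above (a reading of the fetch_sub logic, not a
// specification): patchData->updateCounter is decremented once per master PE,
// and the rebuild runs while the pre-decrement value is still >= 1, so a single
// global update request is serviced once on every device.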
765 
766 bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy)
767 {
768  cudaCheck(cudaSetDevice(deviceID));
769  const float OVERALLOC = 1.5f;
770 
771  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated ) {
772  return false;
773  }
774 
775  // Before deallocating, record whether the saved-force buffers (d_f_saved_nbond_*,
776  // d_f_saved_slow_*) were in use so they can be reallocated at the new size below
777  bool realloc_gpu_saved_force = false;
778  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
779  realloc_gpu_saved_force = true;
780  }
781 
782 
783  deallocateArrays();
784 
785  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
786  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
787 
788  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
789  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
790  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
791  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
792  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
793  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
794  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
795  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
796  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
797  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
798  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
799  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);
800  if (simParams->colvarsOn || simParams->tclForcesOn){
801  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
802  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
803  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);
804  }
805  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
806  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
807  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);
808 
809  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
810  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
811  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
812  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
813  allocate_host<char3>(&transform, numAtomsHomeAllocated);
814  allocate_host<float>(&mass, numAtomsHomeAllocated);
815  if (simParams->alchOn) {
816  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);
817  }
818  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
819  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
820  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
821 
822  // array buffers for the pseudorandom normal distribution must have even length;
823  // choose n to be the smallest even number >= numAtomsHomeAllocated,
824  // which guarantees that n >= numIntegrationAtoms
825  int n = (numAtomsHomeAllocated + 1) & (~1);
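 // (e.g., 1000 allocated atoms -> n = 1000; 1001 -> n = 1002)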
826  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
827  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
828  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
829  if (simParams->fixedAtomsOn) {
830  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
831  allocate_host(&fixedPosition_x, numAtomsHomeAllocated);
832  allocate_host(&fixedPosition_y, numAtomsHomeAllocated);
833  allocate_host(&fixedPosition_z, numAtomsHomeAllocated);
834  }
835  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
836 
837 
838  if (simParams->useDeviceMigration) {
839  allocate_host<int>(&idMig, numAtomsHomeAllocated);
840  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
841  }
842 
843  allocate_device<double>(&d_f_raw, 9 * numAtomsHomeAndProxyAllocated); // Total number of force buffers
844  d_f_normal_x = &d_f_raw[numAtomsHomeAndProxyAllocated*0];
845  d_f_normal_y = &d_f_raw[numAtomsHomeAndProxyAllocated*1];
846  d_f_normal_z = &d_f_raw[numAtomsHomeAndProxyAllocated*2];
847  d_f_nbond_x = &d_f_raw[numAtomsHomeAndProxyAllocated*3];
848  d_f_nbond_y = &d_f_raw[numAtomsHomeAndProxyAllocated*4];
849  d_f_nbond_z = &d_f_raw[numAtomsHomeAndProxyAllocated*5];
850  d_f_slow_x = &d_f_raw[numAtomsHomeAndProxyAllocated*6];
851  d_f_slow_y = &d_f_raw[numAtomsHomeAndProxyAllocated*7];
852  d_f_slow_z = &d_f_raw[numAtomsHomeAndProxyAllocated*8];
853  allocate_device<double>(&d_pos_raw, 3 * numAtomsHomeAndProxyAllocated);
854  d_pos_x = &d_pos_raw[numAtomsHomeAndProxyAllocated*0];
855  d_pos_y = &d_pos_raw[numAtomsHomeAndProxyAllocated*1];
856  d_pos_z = &d_pos_raw[numAtomsHomeAndProxyAllocated*2];
857  allocate_device<float>(&d_charge, numAtomsHomeAndProxyAllocated);
858  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal){
859  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
860  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
861  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);
862  }
863  if (realloc_gpu_saved_force) {
864  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
865  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
866  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
867  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
868  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
869  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
870  }
871  allocate_device<int>(&d_sortOrder, numAtomsHomeAndProxyAllocated);
872  allocate_device<int>(&d_unsortOrder, numAtomsHomeAndProxyAllocated);
873 
874  // allocate memory for backup forces in MC barostat
875  if (simParams->monteCarloPressureOn) {
876  // Total number of backup force and positions buffers
877  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
878  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
879  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
880  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
881  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
882  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
883  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
884  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
885  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
886  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
887  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
888  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
889  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
890  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];
891 
892  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
893  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
894  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
895  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
896  // we can use molecule_size + 1, rather than atom size
897  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
898  }
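 // The MC buffers above hold a full copy of forces and positions; presumably they
 // let a rejected Monte Carlo volume move restore the pre-move state without
 // recomputing forces (see the d_*MC_* aliases into d_f_rawMC and d_pos_rawMC).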
899 
900  if (simParams->alchOn) {
901  allocate_device<int>(&d_partition, numAtomsHomeAndProxyAllocated);
902  }
903 
904  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
905  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
906  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
907  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
908  allocate_device<double>(&d_vel_x, numAtomsHomeAllocated);
909  allocate_device<double>(&d_vel_y, numAtomsHomeAllocated);
910  allocate_device<double>(&d_vel_z, numAtomsHomeAllocated);
911  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
912  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
913  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
914  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
915  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
916  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
917  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
918  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
919  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
920  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
921  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
922  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
923  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
924  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);
925 
926  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
927  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
928  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
929  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
930  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
931  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
932  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
933  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
934  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
935  // Haochuan: I have to allocate atomFixed because buildRattleLists will use it regardless of fixedAtomsOn
936  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
937  if (simParams->fixedAtomsOn) {
938  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
939  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
940  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
941  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
942  }
943 
944  if (simParams->useDeviceMigration) {
945  allocate_device<int>(&d_idMig, numAtomsHomeAndProxyAllocated);
946  allocate_device<int>(&d_vdwType, numAtomsHomeAndProxyAllocated);
947  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
948  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
949  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
950  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
951  }
952 
953  // These arrays will get allocated when total forces of the GPU global calculations are requested
954  d_f_saved_nbond_x = nullptr;
955  d_f_saved_nbond_y = nullptr;
956  d_f_saved_nbond_z = nullptr;
957  d_f_saved_slow_x = nullptr;
958  d_f_saved_slow_y = nullptr;
959  d_f_saved_slow_z = nullptr;
960 
961  // Zero the position and velocity buffers (host and device) so later reductions/accumulations start from zero
962  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
963  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
964  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
965  cudaCheck(cudaMemset(d_pos_x, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
966  cudaCheck(cudaMemset(d_pos_y, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
967  cudaCheck(cudaMemset(d_pos_z, 0 , sizeof(double)*numAtomsHomeAndProxyAllocated));
968  cudaCheck(cudaMemset(d_vel_x, 0 , sizeof(double)*numAtomsHomeAllocated));
969  cudaCheck(cudaMemset(d_vel_y, 0 , sizeof(double)*numAtomsHomeAllocated));
970  cudaCheck(cudaMemset(d_vel_z, 0 , sizeof(double)*numAtomsHomeAllocated));
971 
972  cudaCheck(cudaMemset(d_posNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
973  cudaCheck(cudaMemset(d_posNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
974  cudaCheck(cudaMemset(d_posNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
975  cudaCheck(cudaMemset(d_velNew_x, 0 , sizeof(double)*numAtomsHomeAllocated));
976  cudaCheck(cudaMemset(d_velNew_y, 0 , sizeof(double)*numAtomsHomeAllocated));
977  cudaCheck(cudaMemset(d_velNew_z, 0 , sizeof(double)*numAtomsHomeAllocated));
978 
979  return true;
980 }
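// Sizing example for reallocateArrays(): with OVERALLOC = 1.5f, a request for
// 10,000 home atoms allocates 15,000 slots, so later calls with up to 15,000
// home atoms return false and reuse the existing buffers; only growth beyond the
// over-allocated capacity triggers a free-and-reallocate cycle.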
981 
982 void SequencerCUDA::reallocateMigrationDestination() {
983  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
984  allocate_device<int4>(&d_migrationDestination, numAtomsHomeAndProxyAllocated);
985 }
986 
987 void SequencerCUDA::deallocateArrays() {
988  if (numAtomsHomeAndProxyAllocated != 0) {
989  cudaCheck(cudaSetDevice(deviceID));
990 
991  deallocate_host<double>(&f_normal_x);
992  deallocate_host<double>(&f_normal_y);
993  deallocate_host<double>(&f_normal_z);
994  if (simParams->colvarsOn || simParams->tclForcesOn){
995  deallocate_host<double>(&f_global_x);
996  deallocate_host<double>(&f_global_y);
997  deallocate_host<double>(&f_global_z);
998  }
999  deallocate_host<double>(&f_nbond_x);
1000  deallocate_host<double>(&f_nbond_y);
1001  deallocate_host<double>(&f_nbond_z);
1002  deallocate_host<double>(&f_slow_x);
1003  deallocate_host<double>(&f_slow_y);
1004  deallocate_host<double>(&f_slow_z);
1005  deallocate_host<double>(&pos_x);
1006  deallocate_host<double>(&pos_y);
1007  deallocate_host<double>(&pos_z);
1008  deallocate_host<float>(&charge);
1009  deallocate_host<int>(&sortOrder);
1010  deallocate_host<int>(&unsortOrder);
1011  deallocate_host<double>(&recipMass);
1012  deallocate_host<double>(&vel_x);
1013  deallocate_host<double>(&vel_y);
1014  deallocate_host<double>(&vel_z);
1015  deallocate_host<char3>(&transform);
1016  deallocate_host<float>(&mass);
1017  if (simParams->alchOn) {
1018  deallocate_host<int>(&partition);
1019  }
1020  deallocate_host<float>(&langevinParam);
1021  deallocate_host<float>(&langScalVelBBK2);
1022  deallocate_host<float>(&langScalRandBBK2);
1023 
1024  deallocate_host<int>(&hydrogenGroupSize);
1025  deallocate_host<int>(&atomFixed);
1026  if (simParams->fixedAtomsOn) {
1027  deallocate_host<int>(&groupFixed);
1028  deallocate_host<double>(&fixedPosition_x);
1029  deallocate_host<double>(&fixedPosition_y);
1030  deallocate_host<double>(&fixedPosition_z);
1031  }
1032 
1033  deallocate_host<float>(&rigidBondLength);
1034 
1035  deallocate_device<double>(&d_f_raw);
1036  deallocate_device<double>(&d_pos_raw);
1037  deallocate_device<float>(&d_charge);
1038  deallocate_device<int>(&d_sortOrder);
1039  deallocate_device<int>(&d_unsortOrder);
1040  if (simParams->alchOn) {
1041  deallocate_device<int>(&d_partition);
1042  }
1043 
1044  deallocate_device<double>(&d_posNew_raw);
1045 
1046  if (simParams->monteCarloPressureOn) {
1047  deallocate_device<double>(&d_f_rawMC);
1048  deallocate_device<double>(&d_pos_rawMC);
1049 
1050  deallocate_host<int>(&id);
1051  deallocate_device<int>(&d_id);
1052  deallocate_device<int>(&d_idOrder);
1053  deallocate_device<int>(&d_moleculeAtom);
1054  deallocate_device<int>(&d_moleculeStartIndex);
1055  }
1056 
1057  if (simParams->useDeviceMigration) {
1058  deallocate_host<int>(&idMig);
1059  deallocate_host<int>(&vdwType);
1060  deallocate_device<int>(&d_idMig);
1061  deallocate_device<int>(&d_vdwType);
1062  deallocate_device<FullAtom>(&d_atomdata_AoS);
1063  deallocate_device<int>(&d_migrationGroupSize);
1064  deallocate_device<int>(&d_migrationGroupIndex);
1065  deallocate_device<int>(&d_sortIndex);
1066  }
1067  if (simParams->colvarsOn || simParams->tclForcesOn || simParams->useCudaGlobal){
1068  deallocate_device<double>(&d_f_global_x);
1069  deallocate_device<double>(&d_f_global_y);
1070  deallocate_device<double>(&d_f_global_z);
1071  }
1072  deallocate_device<double>(&d_vel_x);
1073  deallocate_device<double>(&d_vel_y);
1074  deallocate_device<double>(&d_vel_z);
1075  deallocate_device<double>(&d_recipMass);
1076  deallocate_device<char3>(&d_transform);
1077  deallocate_device<double>(&d_velNew_x);
1078  deallocate_device<double>(&d_velNew_y);
1079  deallocate_device<double>(&d_velNew_z);
1080  deallocate_device<double>(&d_posSave_x);
1081  deallocate_device<double>(&d_posSave_y);
1082  deallocate_device<double>(&d_posSave_z);
1083  deallocate_device<double>(&d_rcm_x);
1084  deallocate_device<double>(&d_rcm_y);
1085  deallocate_device<double>(&d_rcm_z);
1086  deallocate_device<double>(&d_vcm_x);
1087  deallocate_device<double>(&d_vcm_y);
1088  deallocate_device<double>(&d_vcm_z);
1089  deallocate_device<float>(&d_mass);
1090  deallocate_device<float>(&d_langevinParam);
1091  deallocate_device<float>(&d_langScalVelBBK2);
1092  deallocate_device<float>(&d_langScalRandBBK2);
1093  deallocate_device<float>(&d_gaussrand_x);
1094  deallocate_device<float>(&d_gaussrand_y);
1095  deallocate_device<float>(&d_gaussrand_z);
1096  deallocate_device<int>(&d_hydrogenGroupSize);
1097  deallocate_device<float>(&d_rigidBondLength);
1098  deallocate_device<int>(&d_atomFixed);
1099  if (simParams->fixedAtomsOn) {
1100  deallocate_device<int>(&d_groupFixed);
1101  deallocate_device<double>(&d_fixedPosition_x);
1102  deallocate_device<double>(&d_fixedPosition_y);
1103  deallocate_device<double>(&d_fixedPosition_z);
1104  }
1105  deallocate_device<double>(&d_f_saved_nbond_x);
1106  deallocate_device<double>(&d_f_saved_nbond_y);
1107  deallocate_device<double>(&d_f_saved_nbond_z);
1108  deallocate_device<double>(&d_f_saved_slow_x);
1109  deallocate_device<double>(&d_f_saved_slow_y);
1110  deallocate_device<double>(&d_f_saved_slow_z);
1111  }
1112 }
1113 
1114 void SequencerCUDA::deallocateStaticArrays() {
1115  cudaCheck(cudaSetDevice(deviceID));
1116 
1117  deallocate_host<cudaTensor>(&extVirial);
1118  deallocate_host<double3>(&extForce);
1119  deallocate_host<double>(&extEnergy);
1120  deallocate_host<unsigned int>(&h_marginViolations);
1121  deallocate_host<unsigned int>(&h_periodicCellSmall);
1122 
1123  // XXX TODO: Deallocate the additional arrays we added for shmem version
1124  deallocate_host<double3>(&awayDists);
1125  deallocate_host<double3>(&patchMin);
1126  deallocate_host<double3>(&patchMax);
1127  deallocate_host<CudaAtom*>(&cudaAtomLists);
1128  deallocate_host<double3>(&patchCenter);
1129  deallocate_host<int>(&globalToLocalID);
1130  deallocate_host<int>(&patchToDeviceMap);
1131  deallocate_host<Lattice>(&pairlist_lattices);
1132  deallocate_host<double>(&patchMaxAtomMovement);
1133  deallocate_host<double>(&patchNewTolerance);
1134  deallocate_host<CudaMInfo>(&mInfo);
1135  deallocate_host<bool*>(&h_patchRecordHasForces);
1136 
1137  deallocate_host<cudaTensor>(&lpVirialNormal);
1138  deallocate_host<cudaTensor>(&lpVirialNbond);
1139  deallocate_host<cudaTensor>(&lpVirialSlow);
1140 
1141  deallocate_device<double3>(&d_awayDists);
1142  deallocate_device<double3>(&d_patchMin);
1143  deallocate_device<double3>(&d_patchMax);
1144  deallocate_device<int>(&d_globalToLocalID);
1145  deallocate_device<int>(&d_patchToDeviceMap);
1146  deallocate_device<int>(&d_sortOrder);
1147  deallocate_device<int>(&d_unsortOrder);
1148  deallocate_device<double3>(&d_patchCenter);
1149  deallocate_device<Lattice>(&d_lattices);
1150  deallocate_device<Lattice>(&d_pairlist_lattices);
1151  deallocate_device<double>(&d_patchMaxAtomMovement);
1152  deallocate_device<double>(&d_patchNewTolerance);
1153  deallocate_device<CudaMInfo>(&d_mInfo);
1154 
1155  deallocate_device<int>(&d_killme);
1156  deallocate_device<char>(&d_barrierFlag);
1157  deallocate_device<unsigned int>(&d_tbcatomic);
1158  deallocate_device<BigReal>(&d_kineticEnergy);
1159  deallocate_device<BigReal>(&d_intKineticEnergy);
1160  deallocate_device<BigReal>(&d_momentum_x);
1161  deallocate_device<BigReal>(&d_momentum_y);
1162  deallocate_device<BigReal>(&d_momentum_z);
1163  deallocate_device<BigReal>(&d_angularMomentum_x);
1164  deallocate_device<BigReal>(&d_angularMomentum_y);
1165  deallocate_device<BigReal>(&d_angularMomentum_z);
1166  deallocate_device<cudaTensor>(&d_virial);
1167  deallocate_device<cudaTensor>(&d_intVirialNormal);
1168  deallocate_device<cudaTensor>(&d_intVirialNbond);
1169  deallocate_device<cudaTensor>(&d_intVirialSlow);
1170  deallocate_device<cudaTensor>(&d_lpVirialNormal);
1171  deallocate_device<cudaTensor>(&d_lpVirialNbond);
1172  deallocate_device<cudaTensor>(&d_lpVirialSlow);
1173  deallocate_device<cudaTensor>(&d_rigidVirial);
1174  deallocate_device<cudaTensor>(&d_extVirial);
1175  deallocate_device<double3>(&d_extForce);
1176  deallocate_device<double>(&d_extEnergy);
1177  deallocate_device<SettleParameters>(&sp);
1178  deallocate_device<unsigned int>(&deviceQueue);
1179  deallocate_device<double*>(&d_peer_pos_x);
1180  deallocate_device<double*>(&d_peer_pos_y);
1181  deallocate_device<double*>(&d_peer_pos_z);
1182  deallocate_device<float*>(&d_peer_charge);
1183  deallocate_device<int*>(&d_peer_partition);
1184  deallocate_device<double*>(&d_peer_vel_x);
1185  deallocate_device<double*>(&d_peer_vel_y);
1186  deallocate_device<double*>(&d_peer_vel_z);
1187  deallocate_device<double*>(&d_peer_fb_x);
1188  deallocate_device<double*>(&d_peer_fb_y);
1189  deallocate_device<double*>(&d_peer_fb_z);
1190  deallocate_device<double*>(&d_peer_fn_x);
1191  deallocate_device<double*>(&d_peer_fn_y);
1192  deallocate_device<double*>(&d_peer_fn_z);
1193  deallocate_device<double*>(&d_peer_fs_x);
1194  deallocate_device<double*>(&d_peer_fs_y);
1195  deallocate_device<double*>(&d_peer_fs_z);
1196 
1197  deallocate_device<bool*>(&d_patchRecordHasForces);
1198 
1199  deallocate_device(&d_fixVirialNormal);
1200  deallocate_device(&d_fixVirialNbond);
1201  deallocate_device(&d_fixVirialSlow);
1202  deallocate_device(&d_fixForceNormal);
1203  deallocate_device(&d_fixForceNbond);
1204  deallocate_device(&d_fixForceSlow);
1205 
1206  if (simParams->useDeviceMigration) {
1207  deallocate_device<FullAtom>(&d_atomdata_AoS_in);
1208  deallocate_device<int>(&d_sortSoluteIndex);
1209  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
1210  }
1211 
1212  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
1213 }
1214 
1215 void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex){
1216  CudaMInfo &m = mInfo[patchIndex];
1217  if (!p->patchMapRead) p->readPatchMap();
1218  for(int x = 0; x < 3; x++){
1219  for(int y = 0; y < 3; y++){
1220  for(int z = 0; z < 3; z++){
1221  // copies migration info over
1222  MigrationInfo *hm = p->mInfo[x][y][z];
1223  if(hm != NULL) m.destPatchID[x][y][z] = hm->destPatchID;
1224  else m.destPatchID[x][y][z] = -1; // let's flag this as -1 for now
1225  }
1226  }
1227  }
1228  if (simParams->useDeviceMigration) {
1229  m.destPatchID[1][1][1] = p->getPatchID();
1230  }
1231 }
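// Layout note (a reading of the code above): destPatchID[x][y][z] covers the
// 3x3x3 neighborhood of migration directions for this patch; directions with no
// neighbor are flagged as -1, and for device migration the center element
// [1][1][1] holds the patch's own ID so atoms that stay put have a valid
// destination.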
1232 
1233 void SequencerCUDA::assembleOrderedPatchList(){
1234  // Assembles the patches on each Pe sharing a device into a device-ordered list
1235  patchList.clear();
1236 
1237  // Handle our own patches
1238  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
1239  HomePatch *p = patchData->devData[deviceIndex].patches[i];
1240  patchList.push_back(p);
1241  }
1242 
1243 
1244  // Do we really need this?
1245 #if 1
1246  patchListHomeAndProxy.clear();
1247  // Set up list of device patches. We need the home patches for everyone
1248  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1249  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1250  const int patchID = localPatches[i].patchID;
1251 
1252 
1253  for(int d = 0; d < CkNumPes(); d++){
1254  PatchMap* pm = PatchMap::ObjectOnPe(d);
1255  HomePatch *patch = pm->homePatch(patchID);
1256  if(patch != NULL) {
1257  patchListHomeAndProxy.push_back(patch);
1258  }
1259  }
1260  }
1261 
1262 #endif
1263 }
1264 
1272 void SequencerCUDA::copyAoSDataToHost() {
1273  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1274  patchData = cpdata.ckLocalBranch();
1275 
1276  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1277  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1278 
1279  CUDAMigrationKernel->update_AoS(
1280  numPatchesHome,
1281  patchData->devData[deviceIndex].d_localPatches,
1282  (FullAtom*) d_atomdata_AoS,
1283  d_vel_x, d_vel_y, d_vel_z,
1284  d_pos_x, d_pos_y, d_pos_z,
1285  stream
1286  );
1287 
1288  for (int i = 0; i < integrationPatches.size(); i++) {
1289  const int numAtoms = localPatches[i].numAtoms;
1290  const int offset = localPatches[i].bufferOffset;
1291  HomePatch *patch = integrationPatches[i];
1292  patch->updateAtomCount(numAtoms, false);
1293  patch->updateAtomBuffers();
1294  FullAtomList& h_atomdata = patch->getAtomList();
1295  copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
1296  }
1297  cudaCheck(cudaStreamSynchronize(stream));
1298 }
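// Illustrative sketch only (not part of NAMD): copyAoSDataToHost above copies each
// patch's slice out of one flat device buffer using the per-patch
// (bufferOffset, numAtoms) bookkeeping and synchronizes the stream once at the end.
// The same pattern with plain CUDA runtime calls and hypothetical types:
#if 0
#include <cuda_runtime.h>
struct SliceRecord { int bufferOffset; int numAtoms; };   // stand-in for CudaLocalRecord
template <typename T>
static void copySlicesToHost(const T* d_flat, T* const* h_dst,
                             const SliceRecord* rec, int numSlices, cudaStream_t s) {
  for (int i = 0; i < numSlices; i++) {
    cudaMemcpyAsync(h_dst[i], d_flat + rec[i].bufferOffset,
                    rec[i].numAtoms * sizeof(T), cudaMemcpyDeviceToHost, s);
  }
  cudaStreamSynchronize(s);  // one synchronization covers all per-slice copies
}
#endif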
1299 
1300 
1307 void SequencerCUDA::migrationLocalInit() {
1308  CUDAMigrationKernel->computeMigrationGroupIndex(
1309  numPatchesHome,
1310  patchData->devData[deviceIndex].d_localPatches,
1311  d_migrationGroupSize,
1312  d_migrationGroupIndex,
1313  stream
1314  );
1315 
1316  CUDAMigrationKernel->update_AoS(
1317  numPatchesHome,
1318  patchData->devData[deviceIndex].d_localPatches,
1319  (FullAtom*) d_atomdata_AoS,
1320  d_vel_x, d_vel_y, d_vel_z,
1321  d_pos_x, d_pos_y, d_pos_z,
1322  stream
1323  );
1324 
1325  CUDAMigrationKernel->computeMigrationDestination(
1326  numPatchesHome,
1327  patchData->devData[deviceIndex].d_localPatches,
1328  myLattice,
1329  d_mInfo,
1330  d_patchToDeviceMap,
1331  d_globalToLocalID,
1332  d_patchMin,
1333  d_patchMax,
1334  d_hydrogenGroupSize,
1335  d_migrationGroupSize,
1336  d_migrationGroupIndex,
1337  d_pos_x, d_pos_y, d_pos_z,
1338  d_migrationDestination,
1339  stream
1340  );
1341 
1342  CUDAMigrationKernel->performLocalMigration(
1343  numPatchesHome,
1344  patchData->devData[deviceIndex].d_localPatches,
1345  (FullAtom*) d_atomdata_AoS,
1346  (FullAtom*) d_atomdata_AoS_in,
1347  d_migrationDestination,
1348  stream
1349  );
1350 
1351  cudaCheck(cudaStreamSynchronize(stream));
1352 }
1353 
1358 void SequencerCUDA::migrationPerform() {
1359  CUDAMigrationKernel->performMigration(
1360  numPatchesHome,
1361  patchData->devData[deviceIndex].d_localPatches,
1362  d_peer_record,
1363  (FullAtom*) d_atomdata_AoS,
1364  d_peer_atomdata,
1365  d_migrationGroupSize,
1366  d_migrationGroupIndex,
1367  d_migrationDestination,
1368  stream
1369  );
1370  cudaCheck(cudaStreamSynchronize(stream));
1371 }
1372 
1373 
1374 void SequencerCUDA::migrationUpdateAtomCounts() {
1375  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1376  patchData = cpdata.ckLocalBranch();
1377 
1378  CUDAMigrationKernel->updateLocalRecords(
1379  numPatchesHome,
1380  patchData->devData[deviceIndex].d_localPatches,
1381  d_peer_record,
1382  patchData->devData[deviceIndex].d_peerPatches,
1383  stream
1384  );
1385 
1386  cudaCheck(cudaStreamSynchronize(stream));
1387 }
1388 
1389 void SequencerCUDA::migrationUpdateAtomOffsets() {
1390  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1391  patchData = cpdata.ckLocalBranch();
1392 
1393  CUDAMigrationKernel->updateLocalRecordsOffset(
1394  numPatchesHomeAndProxy,
1395  patchData->devData[deviceIndex].d_localPatches,
1396  stream
1397  );
1398 
1399  cudaCheck(cudaStreamSynchronize(stream));
1400 }
1401 
1402 void SequencerCUDA::migrationUpdateRemoteOffsets() {
1403  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1404  patchData = cpdata.ckLocalBranch();
1405 
1406  CUDAMigrationKernel->updatePeerRecords(
1407  numPatchesHomeAndProxy,
1408  patchData->devData[deviceIndex].d_localPatches,
1409  d_peer_record,
1410  patchData->devData[deviceIndex].d_peerPatches,
1411  stream
1412  );
1413 
1414  cudaCheck(cudaStreamSynchronize(stream));
1415 }
1416 
1417 void SequencerCUDA::migrationUpdateProxyDestination() {
1418  if (mGpuOn) {
1419  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1420  patchData = cpdata.ckLocalBranch();
1421 
1422  // This is implemented as a put instead of a get to avoid the need
1423  // for a node synchronization.
1424  CUDAMigrationKernel->copyMigrationDestinationToProxies(
1425  deviceIndex,
1426  numPatchesHome,
1427  numPatchesHomeAndProxy,
1428  patchData->devData[deviceIndex].d_localPatches,
1429  patchData->devData[deviceIndex].d_peerPatches,
1430  d_peer_migrationDestination,
1431  stream
1432  );
1433  }
1434 }
1435 
1436 void SequencerCUDA::copyPatchDataToHost() {
1437  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1438  patchData = cpdata.ckLocalBranch();
1439  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1440 
1441  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1442  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1443 
1444  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
1445  cudaCheck(cudaStreamSynchronize(stream));
1446 
1447 
1448  // Update Atom Counts
1449  for (int i = 0; i < numPatchesHome; i++) {
1450  HomePatch* hp = integrationPatches[i];
1451  hp->updateAtomCount(localPatches[i].numAtoms, false);
1452  }
1453  cudaCheck(cudaStreamSynchronize(stream));
1454 }
1455 
1456 // This code path is used only by device migration
1457 void SequencerCUDA::copyAtomDataToDeviceAoS() {
1458  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1459  patchData = cpdata.ckLocalBranch();
1460 
1461  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1462  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1463  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
1464 
1465 
1466  for (int i = 0; i < integrationPatches.size(); i++) {
1467  const int numAtoms = localPatches[i].numAtoms;
1468  if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
1469  iout << iERROR << "The number of atoms in patch " << i << " is "
1470  << numAtoms << ", greater than the limit for GPU atom migration ("
1471  << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
1472  NAMD_bug("NAMD has stopped simulating due to the error above, "
1473  "but you could disable GPUAtomMigration and try again.\n");
1474  }
1475  const int offset = localPatches[i].bufferOffset;
1476  HomePatch *patch = integrationPatches[i];
1477  FullAtomList& h_atomdata = patch->getAtomList();
1478  copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), d_atomdata_AoS_in + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
1479  }
1480  cudaCheck(cudaStreamSynchronize(stream));
1481 }
1482 
1483 /*
1484  * Aggregates and copies various data from the host to the device
1485  *
1486  * Some data is only copied for home patches, while other data is copied for
1487  * home and proxy patches
1488  *
1489  */
1490 void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
1491 
1492  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);
1493  if(copyForces){
1494  switch (maxForceNumber) {
1495  case 2:
1496  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_x, double, stream);
1497  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_y, double, stream);
1498  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_z, double, stream);
1499  case 1:
1500  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_x, double, stream);
1501  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_y, double, stream);
1502  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_z, double, stream);
1503  case 0:
1504  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_x, double, stream);
1505  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_y, double, stream);
1506  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_z, double, stream);
1507  }
1508  }
1509 
1510  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);
1511  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_y, double, stream);
1512  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_z, double, stream);
1513  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
1514  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_y, double, stream);
1515  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_z, double, stream);
1516  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
1517  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream);
1518  if (simParams->alchOn) {
1519  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(partition, int, stream);
1520  }
1521 
1522  if (simParams->langevinOn) {
1523  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
1524  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
1525  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);
1526  }
1527 
1528  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
1529  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
1530  if (simParams->fixedAtomsOn) {
1531  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
1532  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
1533  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
1534  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);
1535  }
1536  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(sortOrder, int, stream);
1537  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(unsortOrder, int, stream);
1538  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
1539 
1540  if (simParams->monteCarloPressureOn) {
1541  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);
1542  //set up initial mapping for global index to local array index
1543  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
1544  }
1545 }
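// Illustrative sketch only: the switch on maxForceNumber above relies on intentional
// fall-through, so requesting force level 2 (slow) also aggregates levels 1
// (nonbonded) and 0 (normal). The idiom in isolation, with hypothetical placeholders:
#if 0
static void copyForceLevels(int maxForceNumber) {
  switch (maxForceNumber) {
    case 2: /* copy slow forces */       // fall through
    case 1: /* copy nonbonded forces */  // fall through
    case 0: /* copy normal forces */     break;
  }
}
#endif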
1546 
1547 
1548 void SequencerCUDA::migrationLocalPost(int startup) {
1549  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1550  patchData = cpdata.ckLocalBranch();
1551 
1552  if (simParams->useDeviceMigration) {
1553  if (!startup) {
1554  CUDAMigrationKernel->transformMigratedPositions(
1555  numPatchesHome,
1556  patchData->devData[deviceIndex].d_localPatches,
1557  d_patchCenter,
1558  (FullAtom*) d_atomdata_AoS_in,
1559  myLattice,
1560  stream
1561  );
1562  }
1563 
1564  // sort solvent/solute data
1565  CUDAMigrationKernel->sortSolventAtoms(
1566  numPatchesHome,
1567  patchData->devData[deviceIndex].d_localPatches,
1568  (FullAtom*) d_atomdata_AoS_in,
1569  (FullAtom*) d_atomdata_AoS,
1570  d_sortSoluteIndex,
1571  stream
1572  );
1573 
1574  double dt = 1.0;
1575  double kbT = 1.0;
1576  double tempFactor = 1.0;
1577  if (simParams->langevinOn) {
1578  dt = simParams->dt * 0.001; // convert timestep to ps
1579  kbT = BOLTZMANN * simParams->langevinTemp;
1580  int lesReduceTemp = (simParams->lesOn && simParams->lesReduceTemp);
1581  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);
1582  }
1583  CUDAMigrationKernel->copy_AoS_to_SoA(
1584  numPatchesHome, simParams->alchOn,
1585  simParams->langevinOn, dt, kbT, tempFactor,
1586  patchData->devData[deviceIndex].d_localPatches,
1587  (FullAtom*) d_atomdata_AoS,
1588  d_recipMass,
1589  d_vel_x, d_vel_y, d_vel_z,
1590  d_pos_x, d_pos_y, d_pos_z,
1591  d_mass, d_charge,
1592  d_idMig, d_vdwType,
1593  d_hydrogenGroupSize, d_migrationGroupSize,
1594  d_atomFixed,
1595  d_rigidBondLength,
1596  d_transform,
1597  d_partition,
1598  d_langevinParam,
1599  d_langScalVelBBK2,
1600  d_langScalRandBBK2,
1601  stream
1602  );
1603  }
1604 
1605  // Other migration post processing steps
1606  if (simParams->useDeviceMigration) {
1607  if (simParams->monteCarloPressureOn) {
1608  CUDASequencerKernel->SetAtomIndexOrder(d_idMig, d_idOrder, numAtomsHome, stream);
1609  }
1610  }
1611 
1612  // JM: Saving positions into these buffers for pairlistCheck
1613  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
1614  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
1615  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
1616 
1617  // JM NOTE: We need to save the lattice at the beginning of the cycle
1618  // in order to use it in SequencerCUDAKernel::pairlistCheck();
1619  myLatticeOld = myLattice;
1620 
1621  // JM NOTE: Recalculates the centers of mass since we have new positions
1622  CUDASequencerKernel->centerOfMass(
1623  d_pos_x, d_pos_y, d_pos_z,
1624  d_rcm_x, d_rcm_y, d_rcm_z,
1625  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1626  CUDASequencerKernel->centerOfMass(
1627  d_vel_x, d_vel_y, d_vel_z,
1628  d_vcm_x, d_vcm_y, d_vcm_z,
1629  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
1630 
1631  cudaCheck(cudaStreamSynchronize(stream));
1632 }
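// Illustrative sketch only: the host-side Langevin scalars passed to copy_AoS_to_SoA
// above follow this arithmetic. The input values below are hypothetical; the
// Boltzmann constant is assumed to be NAMD's value in kcal/(mol*K).
#if 0
static void exampleLangevinScalars() {
  const double dt_fs = 2.0;                 // timestep in fs (hypothetical input)
  const double dt = dt_fs * 0.001;          // converted to ps, as above
  const double boltzmann = 0.001987191;     // kcal/(mol*K), assumed value of BOLTZMANN
  const double kbT = boltzmann * 300.0;     // langevinTemp = 300 K (hypothetical input)
  const bool lesReduceTemp = false;         // lesOn && lesReduceTemp
  const int lesFactor = 1;
  const double tempFactor = lesReduceTemp ? 1.0 / lesFactor : 1.0;
  (void)dt; (void)kbT; (void)tempFactor;    // these values would feed copy_AoS_to_SoA
}
#endif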
1633 
1634 void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
1635  if(simParams->eFieldOn || simParams->SMDOn || simParams->groupRestraintsOn ||
1636  simParams->monteCarloPressureOn || simParams->mgridforceOn || simParams->gridforceOn || simParams->consForceOn ){
1637 
1638  // Hand-copies the transform field into a char3 SOA buffer
1639  // JM NOTE: We're copying ints into char because "transform" is an int
1640  // field in PatchDataSOA but a signed char in FullAtom
1641  size_t offset = 0;
1642  for (int i = 0; i < numPatchesHome; i++) {
1643  PatchDataSOA& current = patchList[i]->patchDataSOA;
1644  const int numPatchAtoms = current.numAtoms;
1645  // memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type));
1646  for(int j = 0; j < numPatchAtoms; j++){
1647  transform[offset + j].x = current.transform_i[j];
1648  transform[offset + j].y = current.transform_j[j];
1649  transform[offset + j].z = current.transform_k[j];
1650  }
1651  offset += numPatchAtoms;
1652  }
1653  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
1654  }
1655 
1656  if (!startup) {
1658  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);
1659  }
1660  if(simParams->constraintsOn) {
1661  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1662  }
1663  if(simParams->SMDOn) {
1664  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1665  }
1666  if(simParams->groupRestraintsOn) {
1667  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1668  }
1669  if(simParams->mgridforceOn || simParams->gridforceOn){
1670 
1671  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
1672  }
1673  if(simParams->consForceOn) {
1674  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
1675  }
1676 
1677  }
1678 }
1679 
1680 void SequencerCUDA::migrationUpdateDestination() {
1681  CUDAMigrationKernel->updateMigrationDestination(
1682  numAtomsHomePrev,
1683  d_migrationDestination,
1684  d_peer_sortSoluteIndex,
1685  stream
1686  );
1687 }
1688 
1689 bool SequencerCUDA::copyPatchData(
1690  const bool copyIn,
1691  const bool startup
1692 ) {
1693  bool reallocated = false;
1694  if (copyIn) {
1695  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1696  patchData = cpdata.ckLocalBranch();
1697 
1698  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
1699 
1700  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
1701  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
1702 
1703  if (startup) {
1704  // numPatchesHomeAndProxy is set when the patch data is constructed
1705  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
1706  numPatchesHome = homePatches.size();
1707  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
1708 
1709  if (simParams->useDeviceMigration) {
1710  // Padded data structures which will not be reallocated
1711  allocate_device<FullAtom>(&d_atomdata_AoS_in, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch);
1712  allocate_device<int>(&d_sortSoluteIndex, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch);
1713  d_migrationDestination = NULL;
1714  }
1715 #if defined(NAMD_HIP)
1716  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
1717  sizeof(CudaLocalRecord)*numPatchesHomeAndProxy,
1718  hipDeviceMallocFinegrained);
1719  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
1720  sizeof(CudaLocalRecord)*peerPatches.size(),
1721  hipDeviceMallocFinegrained);
1722 #else
1723  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
1724  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
1725 #endif
1726  if (simParams->useDeviceMigration) {
1727  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);
1728  }
1729 
1730  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1731  numPatchesHomeAndProxy, stream);
1732  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1733  peerPatches.size(), stream);
1734  if(true || mGpuOn) {
1735  this->assembleOrderedPatchList();
1736  }
1737  this->copySettleParameter();
1738 
1739  for (int i = 0; i < numPatchesHome; i++) {
1740  HomePatch *patch = homePatches[i];
1741  this->copyMigrationInfo(patch, i);
1742  patchNewTolerance[i] = 0.5 * ( simParams->pairlistDist - simParams->cutoff);
1743 
1744  globalToLocalID[patch->getPatchID()] = i;
1745  patchToDeviceMap[patch->getPatchID()] = deviceIndex;
1746  }
1747  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
1748  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
1749 
1750  // The globalToLocalID and patchToDeviceMap data structures need to be system-wide for migration.
1751  // They are also used in tuple migration, so we add them to patchData, where they can be easily
1752  // accessed elsewhere
1753  for (int i = 0; i < deviceCUDA->getNumDevice(); i++) {
1754  if (i == deviceIndex) continue;
1755  std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
1756  for (int j = 0; j < otherPatches.size(); j++) {
1757  HomePatch *patch = otherPatches[j];
1758  globalToLocalID[patch->getPatchID()] = j;
1759  patchToDeviceMap[patch->getPatchID()] = i;
1760  }
1761  }
1762  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
1763  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
1764  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
1765  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;
1766 
1767  // Allocate more data
1768  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);
1769  }
1770 
1771  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
1772  HomePatch *patch = patchListHomeAndProxy[i];
1773  awayDists[i].x = patch->aAwayDist;
1774  awayDists[i].y = patch->bAwayDist;
1775  awayDists[i].z = patch->cAwayDist;
1776  COPY_CUDAVECTOR(patch->center, patchCenter[i]);
1777  COPY_CUDAVECTOR(patch->getMin(), patchMin[i]);
1778  COPY_CUDAVECTOR(patch->getMax(), patchMax[i]);
1779  }
1780 
1781  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
1782  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
1783  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
1784  copy_HtoD<double3>(patchCenter, d_patchCenter, numPatchesHomeAndProxy, stream);
1785 
1786  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
1787  localPatches[numPatchesHomeAndProxy-1].numAtoms;
1788 
1789  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
1790  localPatches[numPatchesHome-1].numAtoms;
1791 
1792  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);
1793 
1794 
1795  numAtomsHomePrev = numAtomsHome;
1796  numAtomsHomeAndProxy = totalAtomCount;
1797  numAtomsHome = homeAtomCount;
1798 
1799  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
1800 
1801  if (!startup) {
1802  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
1803  numPatchesHomeAndProxy, stream);
1804  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
1805  peerPatches.size(), stream);
1806  }
1807  if (startup) {
1808  if (simParams->monteCarloPressureOn) {
1809  //Only at startup, copy the molecule's index info to GPU
1810  Molecule *molecule = Node::Object()->molecule;
1811  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
1812  copy_HtoD<int>(molecule->moleculeStartIndex, d_moleculeStartIndex, molecule->numMolecules + 1, stream);
1813  }
1814  }
1815  }
1816  return reallocated;
1817 }
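// Illustrative sketch only: the totalAtomCount/homeAtomCount computed above read the
// last record because bufferOffset is the running start index of each patch in the
// flat atom buffer, so "last offset + last count" gives the total. The record type
// below is a stand-in, not the real CudaLocalRecord.
#if 0
#include <vector>
struct RecordSketch { int bufferOffset; int numAtoms; };
static int totalAtoms(const std::vector<RecordSketch>& recs) {
  if (recs.empty()) return 0;
  return recs.back().bufferOffset + recs.back().numAtoms;  // end of the last slice
}
#endif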
1818 
1819 void SequencerCUDA::copyDataToPeers(
1820  const bool copyIn
1821 ) {
1822  if (!copyIn) return;
1823  // Positions will be copied by the kernel
1824  // Forces don't need to be copied
1825  // Atom data that needs to be copied: sortOrder, unsortOrder
1826 
1827  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
1828  patchData = cpdata.ckLocalBranch();
1829  if (mGpuOn) {
1830  CUDAMigrationKernel->copyDataToProxies(
1831  deviceIndex,
1832  numPatchesHome,
1833  numPatchesHomeAndProxy,
1834  patchData->devData[deviceIndex].d_localPatches,
1835  d_peer_id,
1836  d_peer_vdwType,
1837  d_peer_sortOrder,
1838  d_peer_unsortOrder,
1839  d_peer_charge,
1840  d_peer_partition,
1841  d_peer_patchCenter,
1842  simParams->alchOn,
1843  stream
1844  );
1845  }
1846  cudaCheck(cudaStreamSynchronize(stream));
1847 }
1848 
1849 void SequencerCUDA::migrationSortAtomsNonbonded() {
1850  CUDAMigrationKernel->sortAtoms(
1851  numPatchesHome, numAtomsHome,
1852  patchData->devData[deviceIndex].d_localPatches,
1853  patchMin, patchMax,
1854  d_pos_x, d_pos_y, d_pos_z,
1855  d_sortOrder,
1856  d_unsortOrder,
1857  d_sortIndex,
1858  stream
1859  );
1860 }
1861 
1862 void SequencerCUDA::maximumMove(
1863  const double maxvel2,
1864  const int numAtoms)
1865 {
1866  CUDASequencerKernel->maximumMove(
1867  maxvel2, d_vel_x, d_vel_y, d_vel_z,
1868  killme, numAtoms, stream);
1869 }
1870 
1871 void SequencerCUDA::submitHalf(
1872  NodeReduction *reduction,
1873  int numAtoms, int part, const bool doEnergy)
1874 {
1875  //BigReal kineticEnergy;
1876  Tensor reduction_virial;
1877  //cudaTensor h_virial;
1878  //BigReal intKineticEnergy;
1879  Tensor reduction_intVirialNormal;
1880  //cudaTensor h_intVirialNormal;
1881  int hgs;
1882 
1883  if(doEnergy){
1884 #if 0
1885  cudaCheck(cudaEventRecord(eventStart,stream));
1886 #endif
1887  CUDASequencerKernel->submitHalf(
1888  simParams->fixedAtomsOn,
1889  d_vel_x, d_vel_y, d_vel_z,
1890  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
1891  d_kineticEnergy, d_intKineticEnergy,
1892  d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
1893  virial_half, intVirialNormal_half,
1894  d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);
1895 #if 0
1896  cudaCheck(cudaEventRecord(eventStop, stream));
1897  cudaCheck(cudaEventSynchronize(eventStop));
1898  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
1899  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
1900  t_submitReductions2 = 0;
1901 #endif
1902  }
1903 }
1904 
1905 void SequencerCUDA::submitReductions(
1906  NodeReduction *reduction,
1907  BigReal origin_x,
1908  BigReal origin_y,
1909  BigReal origin_z,
1910  int marginViolations,
1911  int doEnergy,
1912  int doMomentum,
1913  int numAtomsReduction,
1914  int maxForceNumber)
1915 {
1916  // reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsReduction; //moved to launch_part2, startRun2
1917  // where do I get the margin violations?
1918  // reduction->item(REDUCTION_MARGIN_VIOLATIONS) += marginViolations;
1919  if(doEnergy){
1920  if(doMomentum){
1921  // JM NOTE: Calculates momenta if copyOut
1922  CUDASequencerKernel->submitReduction1(
1923  d_pos_x, d_pos_y, d_pos_z,
1924  d_vel_x, d_vel_y, d_vel_z, d_mass,
1925  d_kineticEnergy,
1926  d_momentum_x, d_momentum_y, d_momentum_z,
1927  d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1928  origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1929  momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1930  numAtomsReduction, stream);
1931  }
1932  Tensor regintVirialNormal;
1933  Tensor regintVirialNbond;
1934  Tensor regintVirialSlow;
1935 
1936 #if 0
1937  cudaCheck(cudaEventRecord(eventStart,stream));
1938 #endif
1939  CUDASequencerKernel->submitReduction2(
1940  simParams->fixedAtomsOn, d_atomFixed,
1941  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
1942  d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1943  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1944  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1945  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1946  d_mass, d_hydrogenGroupSize,
1947  d_kineticEnergy, kineticEnergy,
1948  d_intKineticEnergy, intKineticEnergy,
1949  d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1950  intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1951  d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1952  // Haochuan: actually should be doVirial here
1953  if (simParams->fixedAtomsOn) {
1954  CUDASequencerKernel->calcFixVirial(
1955  maxForceNumber, numAtomsReduction, d_atomFixed,
1956  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1957  d_f_normal_x, d_f_normal_y, d_f_normal_z,
1958  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1959  d_f_slow_x, d_f_slow_y, d_f_slow_z,
1960  d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1961  d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1962  }
1963  }
1964 #if 0
1965  cudaCheck(cudaEventRecord(eventStop, stream));
1966  cudaCheck(cudaEventSynchronize(eventStop));
1967  cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1968  fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1969  t_submitReductions2 = 0;
1970 #endif
1971 }
1972 
1973 void SequencerCUDA::copySettleParameter(){
1974  // Searching for a patch that contains initialized settle parameters
1975  cudaCheck(cudaSetDevice(deviceID));
1976  if(simParams->rigidBonds != RIGID_NONE){
1977  HomePatch *patch = NULL;
1978  // PatchList contains all patches in the node, so if there's a single water in the system,
1979  // this is guaranteed to catch it
1980  for(int i = 0; i < patchList.size(); i++){
1981  if(patchList[i]->settle_initialized) {
1982  patch = patchList[i];
1983  break;
1984  }
1985  }
1986  if ( patch ) {
1987  SettleParameters h_sp;
1988  h_sp.mO = patch->settle_mO;
1989  h_sp.mH = patch->settle_mH;
1990  h_sp.mOrmT = patch->settle_mOrmT;
1991  h_sp.mHrmT = patch->settle_mHrmT;
1992  h_sp.rra = patch->settle_rra;
1993  h_sp.ra = patch->settle_ra;
1994  h_sp.rb = patch->settle_rb;
1995  h_sp.rc = patch->settle_rc;
1996  h_sp.r_om = patch->r_om;
1997  h_sp.r_ohc = patch->r_ohc;
1998 
1999  // fprintf(stderr, "SETTLEPARAMETER Found: Values %lf %lf %lf %lf %lf %lf %lf %lf\n",
2000  // h_sp.mO, h_sp.mH, h_sp.mOrmT, h_sp.mHrmT, h_sp.rra, h_sp.ra, h_sp.rb, h_sp.rc );
2001  copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
2002  }
2003  }
2004 }
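// Illustrative sketch only: copySettleParameter above scans for the first patch with
// initialized SETTLE parameters. The same search expressed with std::find_if over a
// patch container; the struct and function names here are hypothetical.
#if 0
#include <algorithm>
#include <vector>
struct PatchSketch { bool settle_initialized; };
static PatchSketch* firstSettlePatch(std::vector<PatchSketch*>& patches) {
  auto it = std::find_if(patches.begin(), patches.end(),
                         [](const PatchSketch* p) { return p->settle_initialized; });
  return (it != patches.end()) ? *it : nullptr;
}
#endif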
2005 
2006 // Does rattle1_SOA
2007 void SequencerCUDA::startRun1(
2008  int maxForceNumber,
2009  const Lattice& lattice
2010 ) {
2011  // cudaCheck(cudaSetDevice(deviceID));
2012  myLattice = lattice;
2013 
2014  // JM: Enforcing rigid bonds on first iteration
2015  CUDASequencerKernel->rattle1(
2016  simParams->fixedAtomsOn, 1, 0,
2017  numAtomsHome, 0.f, 0.f,
2018  2.0 * simParams->rigidTol,
2019  d_vel_x, d_vel_y, d_vel_z,
2020  d_pos_x, d_pos_y, d_pos_z,
2021  d_velNew_x, d_velNew_y, d_velNew_z,
2022  d_posNew_x, d_posNew_y, d_posNew_z,
2023  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2024  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2025  &settleList, settleListSize, &d_consFailure,
2026  d_consFailureSize, &rattleList, rattleListSize,
2027  &nSettle, &nRattle,
2028  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2029  buildRigidLists, consFailure, simParams->watmodel, stream);
2030 
2031  this->copyPositionsAndVelocitiesToHost(1, 0);
2032  cudaCheck(cudaDeviceSynchronize());
2033 #if 0
2034  printSOAPositionsAndVelocities();
2035 #endif
2036 }
2037 
2038 void SequencerCUDA::startRun2(
2039  double dt_normal,
2040  double dt_nbond,
2041  double dt_slow,
2042  Vector origin,
2043  NodeReduction *reduction,
2044  int doGlobal,
2045  int maxForceNumber
2046 ){
2047  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2048 
2049  // This is a patch-based kernel, which means each threadblock deals with an entire patch.
2050  // So, in the multi-GPU case, we want to keep non-offset pointers but
2051  // deal only with a handful of patches.
2052 
2053  // We don't need to (and should not) zero the normal force buffer here:
2054  // it stores the global forces, and the normal forces are assigned
2055  // every step rather than accumulated
2056  // cudaCheck(cudaMemset(d_f_normal_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2057  // cudaCheck(cudaMemset(d_f_normal_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2058  // cudaCheck(cudaMemset(d_f_normal_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2059 
2060  cudaCheck(cudaMemset(d_f_nbond_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2061  cudaCheck(cudaMemset(d_f_nbond_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2062  cudaCheck(cudaMemset(d_f_nbond_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2063 
2064  cudaCheck(cudaMemset(d_f_slow_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2065  cudaCheck(cudaMemset(d_f_slow_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2066  cudaCheck(cudaMemset(d_f_slow_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2067 
2068  CUDASequencerKernel->accumulateForceToSOA(
2069  doGlobal,
2070  simParams->useCudaGlobal,
2071  maxForceNumber,
2072  numPatchesHomeAndProxy,
2073  nDevices,
2074  patchData->devData[deviceIndex].d_localPatches,
2075  patchData->devData[deviceIndex].f_bond,
2076  patchData->devData[deviceIndex].f_bond_nbond,
2077  patchData->devData[deviceIndex].f_bond_slow,
2078  patchData->devData[deviceIndex].forceStride,
2079  patchData->devData[deviceIndex].f_nbond,
2080  patchData->devData[deviceIndex].f_nbond_slow,
2081  patchData->devData[deviceIndex].f_slow,
2082  d_f_global_x,
2083  d_f_global_y,
2084  d_f_global_z,
2085  d_f_normal_x,
2086  d_f_normal_y,
2087  d_f_normal_z,
2088  d_f_nbond_x,
2089  d_f_nbond_y,
2090  d_f_nbond_z,
2091  d_f_slow_x,
2092  d_f_slow_y,
2093  d_f_slow_z,
2094  d_unsortOrder,
2095  myLattice,
2096  patchData->d_queues,
2097  patchData->d_queueCounters,
2098  d_tbcatomic,
2099  stream
2100  );
2101  if(mGpuOn){
2102  // Synchronize device before node barrier
2103  cudaCheck(cudaDeviceSynchronize());
2104  }
2105 
2106 #if 0
2107  printSOAPositionsAndVelocities();
2108 #endif
2109 }
2110 
2111 void SequencerCUDA::startRun3(
2112  double dt_normal,
2113  double dt_nbond,
2114  double dt_slow,
2115  Vector origin,
2116  NodeReduction *reduction,
2117  const bool requestGlobalForces,
2118  int doGlobalMasterStaleForces,
2119  const bool requestForcesOutput,
2120  const bool requestGlobalForcesGPU,
2121  int maxForceNumber
2122 ){
2123  const bool doFixed = simParams->fixedAtomsOn;
2124  if(mGpuOn){
2125  // XXX TODO we need to call the force merging kernel here
2126 #if 1
2127  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2128  std::vector<int> atom_counts;
2129  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2130  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2131  }
2132  CUDASequencerKernel->mergeForcesFromPeers(
2133  deviceIndex,
2134  maxForceNumber,
2135  myLattice,
2136  numPatchesHomeAndProxy,
2137  numPatchesHome,
2138  this->d_peer_fb_x,
2139  this->d_peer_fb_y,
2140  this->d_peer_fb_z,
2141  this->d_peer_fn_x,
2142  this->d_peer_fn_y,
2143  this->d_peer_fn_z,
2144  this->d_peer_fs_x,
2145  this->d_peer_fs_y,
2146  this->d_peer_fs_z,
2147  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2148  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2149  patchData->devData[deviceIndex].d_localPatches,
2150  patchData->devData[deviceIndex].d_peerPatches,
2151  atom_counts,
2152  stream
2153  );
2154 #else
2155 
2156  // Before I call nccl, let's see the forces here
2157  // ncclAllReduce(d_f_normal_x, d_f_normal_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2158 
2159  // ncclAllReduce(d_f_normal_y, d_f_normal_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2160  // ncclAllReduce(d_f_normal_z, d_f_normal_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2161  // ncclAllReduce(d_f_nbond_x, d_f_nbond_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2162  // ncclAllReduce(d_f_nbond_y, d_f_nbond_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2163  // ncclAllReduce(d_f_nbond_z, d_f_nbond_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2164  // ncclAllReduce(d_f_slow_x, d_f_slow_x, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2165  // ncclAllReduce(d_f_slow_y, d_f_slow_y, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2166  // ncclAllReduce(d_f_slow_z, d_f_slow_z, numAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2167  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2168  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2169 #endif
2170  }
2171  if(doGlobalMasterStaleForces)
2172  {
2173  computeGlobalMasterVirial(
2174  numPatchesHomeAndProxy,
2175  patchData->devData[deviceIndex].d_localPatches,
2176  d_pos_x,
2177  d_pos_y,
2178  d_pos_z,
2179  d_transform,
2180  f_global_x,
2181  f_global_y,
2182  f_global_z,
2183  &d_extForce[EXT_GLOBALMTS],
2184  &extForce[EXT_GLOBALMTS],
2185  &d_extVirial[EXT_GLOBALMTS],
2186  &extVirial[EXT_GLOBALMTS],
2187  myLattice,
2188  d_tbcatomic,
2189  stream);
2190  // ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2191  // ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2192  }
2193 
2194  // compute external forces and store energy and virial
2195  calculateExternalForces(simParams->firstTimestep, reduction, maxForceNumber, 1, 1);
2196 #if 0
2197  cudaCheck(cudaDeviceSynchronize());
2198  if(true || deviceID == 0){
2199  char prefix[10];
2200  snprintf(prefix, 10, "step-%d",0);
2201  this->printSOAForces(prefix);
2202  }
2203 #endif
2204 
2205  CUDASequencerKernel->addForceToMomentum(
2206  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2207  d_recipMass,
2208  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2209  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2210  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2211  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2212  numAtomsHome, maxForceNumber, stream);
2213 
2214  CUDASequencerKernel->rattle1(
2215  simParams->fixedAtomsOn,1, 0,
2216  numAtomsHome, -dt_normal, -1.0/(dt_normal),
2217  2.0 * simParams->rigidTol,
2218  d_vel_x, d_vel_y, d_vel_z,
2219  d_pos_x, d_pos_y, d_pos_z,
2220  d_velNew_x, d_velNew_y, d_velNew_z,
2221  d_posNew_x, d_posNew_y, d_posNew_z,
2222  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2223  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2224  &settleList, settleListSize, &d_consFailure,
2225  d_consFailureSize, &rattleList, rattleListSize,
2226  &nSettle, &nRattle,
2227  d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2228  true, consFailure, simParams->watmodel, stream);
2229 
2230  CUDASequencerKernel->centerOfMass(
2231  d_vel_x, d_vel_y, d_vel_z,
2232  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2233  d_hydrogenGroupSize, numAtomsHome, stream);
2234 
2235 
2236  {
2237  // SubmitHalf and its corresponding reductions
2238  submitHalf(reduction, numAtomsHome, 1, 1);
2239  // submitHalf reductions
2240  cudaCheck(cudaStreamSynchronize(stream));
2241  Tensor reduction_virial;
2242  Tensor reduction_intVirialNormal;
2243  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2244  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2245  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2246  // Haochuan: the tensor is not symmetric when there are fixed atoms
2247  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2248  reduction_virial *= 0.5;
2249  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2250  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2251  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2252  += (intKineticEnergy_half[0] * 0.25);
2253  reduction_intVirialNormal *= 0.5;
2254  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2255  reduction_intVirialNormal);
2256  }
2257 
2258  CUDASequencerKernel->addForceToMomentum(
2259  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2260  d_recipMass,
2261  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2262  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2263  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2264  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2265  numAtomsHome, maxForceNumber, stream);
2266 
2267  CUDASequencerKernel->rattle1(
2268  simParams->fixedAtomsOn, 1, 1,
2269  numAtomsHome, dt_normal, 1.0/dt_normal,
2270  2.0 * simParams->rigidTol,
2271  d_vel_x, d_vel_y, d_vel_z,
2272  d_pos_x, d_pos_y, d_pos_z,
2273  d_velNew_x, d_velNew_y, d_velNew_z,
2274  d_posNew_x, d_posNew_y, d_posNew_z,
2275  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2276  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2277  &settleList, settleListSize, &d_consFailure,
2278  d_consFailureSize, &rattleList, rattleListSize,
2279  &nSettle, &nRattle,
2280  d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2281  buildRigidLists, consFailure, simParams->watmodel, stream);
2282 
2283  CUDASequencerKernel->centerOfMass(
2284  d_vel_x, d_vel_y, d_vel_z,
2285  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2286  d_hydrogenGroupSize, numAtomsHome, stream);
2287 
2288  {
2289  // JM: SubmitHalf and its corresponding reductions
2290  submitHalf(reduction, numAtomsHome, 1, 1);
2291  // submitHalf reductions
2292  cudaCheck(cudaStreamSynchronize(stream));
2293  Tensor reduction_virial;
2294  Tensor reduction_intVirialNormal;
2295  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2296  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2297  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2298  // Haochuan: the tensor is not symmetric when there are fixed atoms
2299  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2300  reduction_virial *= 0.5;
2301  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2302  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
2303  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
2304  += (intKineticEnergy_half[0] * 0.25);
2305  reduction_intVirialNormal *= 0.5;
2306  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2307  reduction_intVirialNormal);
2308  }
2309 
2310  CUDASequencerKernel->addForceToMomentum(
2311  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2312  d_recipMass,
2313  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2314  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2315  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2316  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2317  numAtomsHome, maxForceNumber, stream);
2318 
2319  if(requestGlobalForces || requestForcesOutput) {
2320  // store the forces for next step,
2321  // when we need them for colvars and Tcl scripting
2322  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2323  }
2324 
2325  if (requestGlobalForcesGPU) {
2326  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2327  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2328  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2329  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2330  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2331  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2332  CUDASequencerKernel->copyForcesToDevice(
2333  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2334  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2335  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2336  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2337  // cudaCheck(cudaStreamSynchronize(stream));
2338  }
2339 
2340  CUDASequencerKernel->centerOfMass(
2341  d_vel_x, d_vel_y, d_vel_z,
2342  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2343  d_hydrogenGroupSize, numAtomsHome, stream);
2344 
2345  submitReductions(reduction, origin.x, origin.y, origin.z,
2346  marginViolations, 1,
2347  1,
2348  numAtomsHome, maxForceNumber);
2349 
2350  copyPositionsAndVelocitiesToHost(1, 0);
2351 
2352  if(consFailure[0]){
2353  // Constraint failure. Abort.
2354  int dieOnError = simParams->rigidDie;
2355  if(dieOnError){
2356  // Bails out
2357  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
2358  NAMD_die("constraint failure during CUDA rattle!\n");
2359  }else{
2360  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2361  }
2362  }else if(1){
2363  cudaCheck(cudaStreamSynchronize(stream));
2364  if(simParams->rigidBonds != RIGID_NONE){
2365  Tensor reduction_rigidVirial;
2366  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
2367  // Haochuan: the tensor is not symmetric when there are fixed atoms
2368  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2369  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
2370  }
2371 
2372  //submitReductions1
2373  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
2374  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2375  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
2376  Vector angularMomentum(*angularMomentum_x,
2377  *angularMomentum_y,
2378  *angularMomentum_z);
2379  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
2380  //submitReductions2
2381  Tensor regintVirialNormal;
2382  Tensor regintVirialNbond;
2383  Tensor regintVirialSlow;
2384  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
2385  if (maxForceNumber >= 1) {
2386  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
2387  }
2388  if (maxForceNumber >= 2) {
2389  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
2390  }
2391 
2392  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
2393  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
2394  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
2395  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
2396 
2397  if (simParams->fixedAtomsOn) {
2398  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2399  double3 fixForceNormal, fixForceNbond, fixForceSlow;
2400  switch (maxForceNumber) {
2401  case 2: {
2402  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2403  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2404  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
2405  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
2406  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
2407  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2408  } // intentionally fallthrough
2409  case 1: {
2410  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2411  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2412  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
2413  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
2414  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
2415  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2416  } // intentionally fallthrough
2417  default: {
2418  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2419  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2420  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
2421  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
2422  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
2423  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2424  }
2425  }
2426 #if 0
2427  auto printTensor = [](const cudaTensor& t, const std::string& name){
2428  CkPrintf("%s", name.c_str());
2429  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2430  "%12.5lf %12.5lf %12.5lf\n"
2431  "%12.5lf %12.5lf %12.5lf\n",
2432  t.xx, t.xy, t.xz,
2433  t.yx, t.yy, t.yz,
2434  t.zx, t.zy, t.zz);
2435  };
2436  printTensor(fixVirialNormal, "fixVirialNormal = ");
2437  printTensor(fixVirialNbond, "fixVirialNbond = ");
2438  printTensor(fixVirialSlow, "fixVirialSlow = ");
2439 #endif
2440 
2441  }
2442  }
2443 
2444 #if 0
2445  if(deviceID == 0){
2446  this->printSOAForces(NULL);
2447  }
2448 #endif
2449 
2450 #if 0
2451  printSOAPositionsAndVelocities();
2452 #endif
2453 }
2454 
2455 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2456 {
2457  // Restore the myLattice
2458  myLattice = lattice;
2459  double *temp;
2460 
2461  // Restore positions and forces
2462  temp = d_f_normal_x; d_f_normal_x = d_f_normalMC_x; d_f_normalMC_x = temp;
2463  temp = d_f_normal_y; d_f_normal_y = d_f_normalMC_y; d_f_normalMC_y = temp;
2464  temp = d_f_normal_z; d_f_normal_z = d_f_normalMC_z; d_f_normalMC_z = temp;
2465  temp = d_f_nbond_x; d_f_nbond_x = d_f_nbondMC_x; d_f_nbondMC_x = temp;
2466  temp = d_f_nbond_y; d_f_nbond_y = d_f_nbondMC_y; d_f_nbondMC_y = temp;
2467  temp = d_f_nbond_z; d_f_nbond_z = d_f_nbondMC_z; d_f_nbondMC_z = temp;
2468  temp = d_f_slow_x; d_f_slow_x = d_f_slowMC_x; d_f_slowMC_x = temp;
2469  temp = d_f_slow_y; d_f_slow_y = d_f_slowMC_y; d_f_slowMC_y = temp;
2470  temp = d_f_slow_z; d_f_slow_z = d_f_slowMC_z; d_f_slowMC_z = temp;
2471  #ifdef NAMD_NCCL_ALLREDUCE
2472  if(mGpuOn) {
2473  temp = d_posNew_x; d_posNew_x = d_posMC_x; d_posMC_x = temp;
2474  temp = d_posNew_y; d_posNew_y = d_posMC_y; d_posMC_y = temp;
2475  temp = d_posNew_z; d_posNew_z = d_posMC_z; d_posMC_z = temp;
2476  } else {
2477  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2478  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2479  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2480  }
2481  #else
2482  temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2483  temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2484  temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2485  #endif
2486 }
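// Illustrative sketch only: each temp-mediated exchange above is a plain pointer swap
// between the working buffer and its Monte Carlo backup, equivalent to std::swap on
// the two device pointers.
#if 0
#include <utility>
static void swapDeviceBuffers(double*& d_current, double*& d_backup) {
  std::swap(d_current, d_backup);  // restore the backup, keep the rejected buffer around
}
#endif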
2487 
2488 void SequencerCUDA::monteCarloPressure_accept(
2489  NodeReduction *reduction,
2490  const int doMigration)
2491 {
2492  // do we need to update the centers of mass?
2493  CUDASequencerKernel->centerOfMass(
2494  d_pos_x, d_pos_y, d_pos_z,
2495  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2496  d_hydrogenGroupSize, numAtomsHome, stream);
2497 
2498  // Add half step kinetic contribution to energy, intEnergy, virial, intVirial,
2499  // calculated by submitHalf in launch_part11
2500  Tensor reduction_virial;
2501  Tensor reduction_intVirialNormal;
2502  COPY_CUDATENSOR(virial_half[0], reduction_virial);
2503  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
2504  // Haochuan: the tensor is not symmetric when there are fixed atoms
2505  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2506  reduction_virial *= 0.5;
2507  reduction_intVirialNormal *= 0.5;
2508  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
2509  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
2510  reduction_intVirialNormal);
2511  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
2512  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY) += (intKineticEnergy_half[0] * 0.25);
2513 
2514  // If this was a migration step and the move was accepted, we need to update
2515  // myLatticeOld in order to use it in SequencerCUDAKernel::pairlistCheck();
2516  if(doMigration) {
2517  myLatticeOld = myLattice;
2518  }
2519 }
2520 
2521 void SequencerCUDA::monteCarloPressure_part1(
2522  Tensor &factor,
2523  Vector &origin,
2524  Lattice &oldLattice)
2525 {
2526  // Backup positions and forces
2527  //copy_DtoD<double>(d_f_raw, d_f_rawMC, numAtoms*9, stream);
2528  copy_DtoD<double>(d_f_normal_x, d_f_normalMC_x, numAtomsHome, stream);
2529  copy_DtoD<double>(d_f_normal_y, d_f_normalMC_y, numAtomsHome, stream);
2530  copy_DtoD<double>(d_f_normal_z, d_f_normalMC_z, numAtomsHome, stream);
2531  copy_DtoD<double>(d_f_nbond_x, d_f_nbondMC_x, numAtomsHome, stream);
2532  copy_DtoD<double>(d_f_nbond_y, d_f_nbondMC_y, numAtomsHome, stream);
2533  copy_DtoD<double>(d_f_nbond_z, d_f_nbondMC_z, numAtomsHome, stream);
2534  copy_DtoD<double>(d_f_slow_x, d_f_slowMC_x, numAtomsHome, stream);
2535  copy_DtoD<double>(d_f_slow_y, d_f_slowMC_y, numAtomsHome, stream);
2536  copy_DtoD<double>(d_f_slow_z, d_f_slowMC_z, numAtomsHome, stream);
2537 #ifdef NAMD_NCCL_ALLREDUCE
2538  if(mGpuOn) {
2539  //copy_DtoD<double>(d_posNew_raw, d_pos_rawMC, numAtomsHome*3, stream);
2540  copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2541  copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2542  copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2543  } else {
2544  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2545  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2546  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2547  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2548  }
2549 #else
2550  //copy_DtoD<double>(d_pos_raw, d_pos_rawMC, numAtomsHome*3, stream);
2551  copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2552  copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2553  copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2554 #endif
2555 
2556  // Scale the old lattice by factor. We need both the old and the new lattice
2557  // to properly unwrap and wrap the atoms' coordinates
2558  Lattice newLattice = oldLattice;
2559  newLattice.rescale(factor);
2560  cudaTensor cuFactor;
2561  cudaVector cuOrigin;
2562  COPY_CUDATENSOR(factor, cuFactor);
2563  COPY_CUDAVECTOR(origin, cuOrigin);
2564 
2565  // Scale the coordinates using each molecule's geometric center
2566  Molecule *molecule = Node::Object()->molecule;
2567  CUDASequencerKernel->scaleCoordinateUsingGC(
2568  d_pos_x, d_pos_y, d_pos_z, d_idOrder, d_moleculeStartIndex,
2569  d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2570  d_transform, molecule->numMolecules, molecule->numLargeMolecules,
2571  stream);
2572 
2573  // Update the cuda lattice with newLattice for force calculation
2574  myLattice = newLattice;
2575 
2576  // Set up compute position before calling bonded and nonbonded kernel
2577  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2578  ComputeNonbondedUtil::dielectric_1);
2579  bool doNbond = patchData->flags.doNonbonded;
2580  bool doSlow = patchData->flags.doFullElectrostatics;
2581  bool doFEP = false;
2582  bool doTI = false;
2583  bool doAlchDecouple = false;
2584  bool doAlchSoftCore = false;
2585  if (simParams->alchOn) {
2586  if (simParams->alchFepOn) doFEP = true;
2587  if (simParams->alchThermIntOn) doTI = true;
2588  if (simParams->alchDecouple) doAlchDecouple = true;
2589  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2590  }
2591 
2592  if (Node::Object()->molecule->is_lonepairs_psf) {
2593  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
2594  }
2595 
2596  bool usePatchPme = false;
2597  if (deviceCUDA->getIsPmeDevice() && doSlow) {
2598  // This will check if the current lattice is compatible with the patch-level PME kernels
2599  // This needs to be redone every time the lattice changes, the results are stored in
2600  // the cudaPme object, and the overall compatibility can be computed with the compatible()
2601  // function.
2602  //
2603  // The behavior of set compute positions PME is different depending on the kernels being used,
2604  // so that value needs to be passed to the kernel object
2605  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
2606  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
2608  myLattice,
2609  getNumPatchesHome(),
2610  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
2611  getHostPatchMin(),
2612  getHostPatchMax(),
2613  getHostAwayDists()
2614  );
2615  usePatchPme = cudaPme->patchLevelPmeData.compatible();
2616  }
2617 
2618  std::vector<int> atom_counts;
2619  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2620  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2621  }
2622  CUDASequencerKernel->set_compute_positions(
2623  deviceIndex,
2625  nDevices,
2626  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2627  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2628 #ifdef NAMD_NCCL_ALLREDUCE
2629  (mGpuOn) ? d_posNew_x: d_pos_x,
2630  (mGpuOn) ? d_posNew_y: d_pos_y,
2631  (mGpuOn) ? d_posNew_z: d_pos_z,
2632 #else
2633  d_pos_x,
2634  d_pos_y,
2635  d_pos_z,
2636  d_peer_pos_x, // passes double-pointer if mgpuOn
2637  d_peer_pos_y,
2638  d_peer_pos_z,
2639  d_peer_charge,
2640  d_peer_partition,
2641 #endif
2642  d_charge, d_partition, charge_scaling,
2643  d_patchCenter,
2644  patchData->devData[deviceIndex].slow_patchPositions,
2645  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2646  d_sortOrder, newLattice,
2647  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2648  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2650  patchData->devData[deviceIndex].d_localPatches,
2651  patchData->devData[deviceIndex].d_peerPatches,
2652  atom_counts,
2653  stream);
2654 
2655  cudaCheck(cudaStreamSynchronize(stream));
2656 }
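// Illustrative sketch only: a Monte Carlo volume move rescales the lattice by the
// factor tensor and maps reference points through the affine transform
// p' = origin + factor * (p - origin); scaleCoordinateUsingGC above applies the
// rescaling per molecule via geometric centers. Plain structs stand in for NAMD's
// Vector/Tensor types here.
#if 0
struct V3 { double x, y, z; };
struct M3 { double xx, xy, xz, yx, yy, yz, zx, zy, zz; };
static V3 scaleAboutOrigin(const V3& p, const V3& o, const M3& F) {
  const V3 d = { p.x - o.x, p.y - o.y, p.z - o.z };
  return { o.x + F.xx*d.x + F.xy*d.y + F.xz*d.z,
           o.y + F.yx*d.x + F.yy*d.y + F.yz*d.z,
           o.z + F.zx*d.x + F.zy*d.y + F.zz*d.z };
}
#endif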
2657 
2658 void SequencerCUDA::monteCarloPressure_part2(
2659  NodeReduction *reduction,
2660  int step,
2661  int maxForceNumber,
2662  const bool doEnergy,
2663  const bool doGlobal,
2664  const bool doVirial)
2665 {
2666  // We zero all reduction values in part1, so this needs to be added back here
2667  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
2668 
2669  if(mGpuOn){
2670 #ifdef NAMD_NCCL_ALLREDUCE
2671  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2672 #endif
2673  }
2674  //Update SOA buffer
2675  CUDASequencerKernel->accumulateForceToSOA(
2676  doGlobal,
2677  simParams->useCudaGlobal,
2678  maxForceNumber,
2679  numPatchesHomeAndProxy,
2680  nDevices,
2681  patchData->devData[deviceIndex].d_localPatches,
2682  patchData->devData[deviceIndex].f_bond,
2683  patchData->devData[deviceIndex].f_bond_nbond,
2684  patchData->devData[deviceIndex].f_bond_slow,
2685  patchData->devData[deviceIndex].forceStride,
2686  patchData->devData[deviceIndex].f_nbond,
2687  patchData->devData[deviceIndex].f_nbond_slow,
2688  patchData->devData[deviceIndex].f_slow,
2689  d_f_global_x,
2690  d_f_global_y,
2691  d_f_global_z,
2692  d_f_normal_x,
2693  d_f_normal_y,
2694  d_f_normal_z,
2695  d_f_nbond_x,
2696  d_f_nbond_y,
2697  d_f_nbond_z,
2698  d_f_slow_x,
2699  d_f_slow_y,
2700  d_f_slow_z,
2701  d_unsortOrder,
2702  myLattice,
2703  patchData->d_queues,
2704  patchData->d_queueCounters,
2705  d_tbcatomic,
2706  stream
2707  );
2708  if(mGpuOn){
2709 #ifndef NAMD_NCCL_ALLREDUCE
2710  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
2711  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
2712  // 0, deviceCUDA->getNcclComm(), stream);
2713  std::vector<int> atom_counts;
2714  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
2715  atom_counts.push_back(patchData->devData[i].numAtomsHome);
2716  }
2717  CUDASequencerKernel->mergeForcesFromPeers(
2718  deviceIndex,
2719  maxForceNumber,
2720  myLattice,
2721  numPatchesHomeAndProxy,
2722  numPatchesHome,
2723  this->d_peer_fb_x,
2724  this->d_peer_fb_y,
2725  this->d_peer_fb_z,
2726  this->d_peer_fn_x,
2727  this->d_peer_fn_y,
2728  this->d_peer_fn_z,
2729  this->d_peer_fs_x,
2730  this->d_peer_fs_y,
2731  this->d_peer_fs_z,
2732  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
2733  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
2734  patchData->devData[deviceIndex].d_localPatches,
2735  patchData->devData[deviceIndex].d_peerPatches,
2736  atom_counts,
2737  stream
2738  );
2739 #else
2740  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2741  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
2742 #endif
2743  }
2744  // do external forces calculation
2745  calculateExternalForces(step, reduction, maxForceNumber, doEnergy, doVirial);
2746 #if 0
2747  cudaCheck(cudaDeviceSynchronize());
2748  if(true || deviceID == 0){
2749  char prefix[10];
2750  snprintf(prefix, 10, "step-%d",step);
2751  this->printSOAForces(prefix);
2752  }
2753 #endif
2754 
2755 }
2756 
2757 void SequencerCUDA::launch_part1(
2758  int step,
2759  double dt_normal,
2760  double dt_nbond,
2761  double dt_slow,
2762  double velrescaling,
2763  const double maxvel2,
2764  NodeReduction *reduction,
2765  Tensor &factor,
2766  Vector &origin,
2767  Lattice &lattice,
2768  int reassignVelocitiesStep,
2769  int langevinPistonStep,
2770  int berendsenPressureStep,
2771  int maxForceNumber,
2772  const int copyIn,
2773  const int savePairlists,
2774  const int usePairlists,
2775  const bool doEnergy)
2776 {
2777  PatchMap* patchMap = PatchMap::Object();
2778  // Aggregate data from all patches
2779  cudaCheck(cudaSetDevice(deviceID));
2780  this->maxvel2 = maxvel2;
2781  const bool doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
2782  const bool doFixed = simParams->fixedAtomsOn;
2783  // JM: for launch_part1:
2784  // copyIn: first call
2785  myLattice = lattice;
2786  if(reassignVelocitiesStep)
2787  {
2788  const int reassignFreq = simParams->reassignFreq;
2789  BigReal newTemp = simParams->reassignTemp;
2790  newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2791  if ( simParams->reassignIncr > 0.0 ) {
2792  if ( newTemp > simParams->reassignHold && simParams->reassignHold > 0.0 )
2793  newTemp = simParams->reassignHold;
2794  } else {
2795  if ( newTemp < simParams->reassignHold )
2796  newTemp = simParams->reassignHold;
2797  }
2798  const BigReal kbT = BOLTZMANN * newTemp;
2799 
2800  CUDASequencerKernel->reassignVelocities(
2801  dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2802  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2803  d_vel_x, d_vel_y, d_vel_z,
2804  d_recipMass, kbT,
2805  numAtomsHome, numAtomsHome, 0,
2806  curandGen, stream);
2807  }
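  // A minimal sketch of the temperature ramp computed above (restating the
  // code; integer division gives the floor):
  //
  //   newTemp(step) = reassignTemp + (step / reassignFreq) * reassignIncr
  //   if (reassignIncr > 0 && reassignHold > 0)  newTemp = min(newTemp, reassignHold)
  //   if (reassignIncr <= 0)                     newTemp = max(newTemp, reassignHold)
  //
  // so reassignHold acts as the plateau once the ramp reaches it, and the
  // kernel then resamples velocities at kbT = BOLTZMANN * newTemp.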
2808 
2809  // scale the positions for the Berendsen pressure controller
2810  if(berendsenPressureStep) {
2811  cudaTensor cuFactor;
2812  cudaVector cuOrigin;
2813  COPY_CUDATENSOR(factor, cuFactor);
2814  COPY_CUDAVECTOR(origin, cuOrigin);
2815  CUDASequencerKernel->scaleCoordinateWithFactor(
2816  d_pos_x, d_pos_y, d_pos_z, d_mass, d_hydrogenGroupSize,
2817  cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2818  }
2819 
2820  if(!langevinPistonStep){
2821  // kernel fusion here
2822  // JM TODO: Fuse kernels for the langevin thermostat
2823  CUDASequencerKernel->velocityVerlet1(
2824  doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2825  dt_slow, velrescaling, d_recipMass,
2826  d_vel_x, d_vel_y, d_vel_z, maxvel2, killme, d_pos_x, d_pos_y, d_pos_z,
2827  pos_x, pos_y, pos_z, d_f_normal_x, d_f_normal_y, d_f_normal_z,
2828  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z,
2829  d_atomFixed, numAtomsHome, maxForceNumber, stream);
2830  }else{
2831  // Zero-out force buffers here
2832  CUDASequencerKernel->addForceToMomentum(
2833  doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2834  d_recipMass,
2835  d_f_normal_x, d_f_normal_y, d_f_normal_z,
2836  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2837  d_f_slow_x, d_f_slow_y, d_f_slow_z,
2838  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2839  numAtomsHome, maxForceNumber, stream);
2840 
2841  maximumMove(maxvel2, numAtomsHome);
2842  cudaTensor cuFactor;
2843  cudaVector cuOrigin;
2844  COPY_CUDATENSOR(factor, cuFactor);
2845  COPY_CUDAVECTOR(origin, cuOrigin);
2846  double velFactor_x = namd_reciprocal(factor.xx);
2847  double velFactor_y = namd_reciprocal(factor.yy);
2848  double velFactor_z = namd_reciprocal(factor.zz);
2849 
2850  CUDASequencerKernel->addVelocityToPosition(
2851  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2852  d_vel_x, d_vel_y, d_vel_z,
2853  d_pos_x, d_pos_y, d_pos_z,
2854  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2855  CUDASequencerKernel->langevinPiston(
2856  simParams->fixedAtomsOn, d_atomFixed,
2857  d_groupFixed, d_transform, lattice,
2858  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2859  d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
2860  d_mass, d_hydrogenGroupSize,
2861  cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2862  simParams->useGroupPressure, numAtomsHome, stream);
2863  CUDASequencerKernel->addVelocityToPosition(
2864  simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2865  d_vel_x, d_vel_y, d_vel_z,
2866  d_pos_x, d_pos_y, d_pos_z,
2867  pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2868  }
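  // A minimal sketch of the update order in the Langevin-piston branch above
  // (restating the code): the drift is split into two half steps around the
  // cell rescale,
  //
  //   x += 0.5*dt * v;               // addVelocityToPosition
  //   (x, v) <- rescale by factor;   // langevinPiston, velFactor = 1/factor on the diagonal
  //   x += 0.5*dt * v;               // addVelocityToPosition
  //
  // while the non-piston branch performs the kick and full drift in the single
  // fused velocityVerlet1 kernel.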
2869 
2870  // JM: Recalculate centers of mass if energy calculation or langevinPistonOn
2871  if( (doEnergy || doVirial) ) {
2872  CUDASequencerKernel->centerOfMass(
2873  d_pos_x, d_pos_y, d_pos_z,
2874  d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2875  d_hydrogenGroupSize, numAtomsHome, stream);
2876  CUDASequencerKernel->centerOfMass(
2877  d_vel_x, d_vel_y, d_vel_z,
2878  d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2879  d_hydrogenGroupSize, numAtomsHome, stream);
2880  }
2881 
2882  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2883      ComputeNonbondedUtil::dielectric_1);
2884  // We need to find doNbond and doSlow for upcoming step
2885  bool doNbond = patchData->flags.doNonbonded;
2886  bool doSlow = patchData->flags.doFullElectrostatics;
2887 
2888  bool doFEP = false;
2889  bool doTI = false;
2890  bool doAlchDecouple = false;
2891  bool doAlchSoftCore = false;
2892  if (simParams->alchOn) {
2893  if (simParams->alchFepOn) doFEP = true;
2894  if (simParams->alchThermIntOn) doTI = true;
2895  if (simParams->alchDecouple) doAlchDecouple = true;
2896  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2897  }
2898  if ( ! savePairlists ) {
2899 
2900  double minSize = simParams->patchDimension - simParams->margin;
2901  double sysdima = lattice.a_r().unit() * lattice.a();
2902  double sysdimb = lattice.b_r().unit() * lattice.b();
2903  double sysdimc = lattice.c_r().unit() * lattice.c();
2904  // Let's pass migrationInfo here
2905 
2906  // EJB: let's assign this before passing it into a function, because it is
2907  // never initialized otherwise
2908  rescalePairlistTolerance = false;
2909  CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2910  patchData->devData[deviceIndex].d_localPatches,
2911  d_pos_x, d_pos_y, d_pos_z, d_posSave_x, d_posSave_y, d_posSave_z,
2912  d_awayDists,
2913  myLattice, myLatticeOld,
2914  d_patchMin, d_patchMax, d_patchCenter,
2915  d_mInfo,
2916  d_tbcatomic, simParams->pairlistTrigger,
2917  simParams->pairlistGrow, simParams->pairlistShrink,
2918  d_patchMaxAtomMovement, patchMaxAtomMovement,
2919  d_patchNewTolerance, patchNewTolerance,
2920  minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2921  h_marginViolations,
2922  h_periodicCellSmall,
2923  rescalePairlistTolerance,
2924  isPeriodic, stream);
2925 
2926  }
2927  else {
2928  rescalePairlistTolerance = true;
2929  }
2930 
2931  if(mGpuOn){
2932  // Synchronize device before node barrier
2933  cudaCheck(cudaStreamSynchronize(stream));
2934  }
2935 }
2936 
2937 void SequencerCUDA::launch_part11(
2938  double dt_normal,
2939  double dt_nbond,
2940  double dt_slow,
2941  double velrescaling,
2942  const double maxvel2,
2943  NodeReduction *reduction,
2944  Tensor &factor,
2945  Vector &origin,
2946  Lattice &lattice,
2947  int langevinPistonStep,
2948  int maxForceNumber,
2949  const int copyIn,
2950  const int savePairlists,
2951  const int usePairlists,
2952  const bool doEnergy)
2953 {
2954  const bool doVirial = simParams->langevinPistonOn;
2955  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2956      ComputeNonbondedUtil::dielectric_1);
2957  // We need to find doNbond and doSlow for upcoming step
2958  bool doNbond = patchData->flags.doNonbonded;
2959  bool doSlow = patchData->flags.doFullElectrostatics;
2960 
2961  bool doFEP = false;
2962  bool doTI = false;
2963  bool doAlchDecouple = false;
2964  bool doAlchSoftCore = false;
2965  if (simParams->alchOn) {
2966  if (simParams->alchFepOn) doFEP = true;
2967  if (simParams->alchThermIntOn) doTI = true;
2968  if (simParams->alchDecouple) doAlchDecouple = true;
2969  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2970  }
2971 
2972  submitHalf(reduction, numAtomsHome, 1, doEnergy || doVirial);
2973 
2974  // Updating numerical flags
2975  NAMD_EVENT_START(1, NamdProfileEvent::CPY_PATCHFLAGS);
2976  this->update_patch_flags();
2977  NAMD_EVENT_STOP(1, NamdProfileEvent::CPY_PATCHFLAGS);
2978 
2979  finish_part1(copyIn, patchList[0]->flags.savePairlists,
2980  patchList[0]->flags.usePairlists, reduction);
2981 }
2982 
2983 
2984 void SequencerCUDA::launch_set_compute_positions() {
2985 
2986  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
2987      ComputeNonbondedUtil::dielectric_1);
2988  // We need to find doNbond and doSlow for upcoming step
2989  bool doNbond = patchData->flags.doNonbonded;
2990  bool doSlow = patchData->flags.doFullElectrostatics;
2991 
2992  bool doFEP = false;
2993  bool doTI = false;
2994  bool doAlchDecouple = false;
2995  bool doAlchSoftCore = false;
2996  if (simParams->alchOn) {
2997  if (simParams->alchFepOn) doFEP = true;
2998  if (simParams->alchThermIntOn) doTI = true;
2999  if (simParams->alchDecouple) doAlchDecouple = true;
3000  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3001  }
3002  bool doGlobal = simParams->tclForcesOn || simParams->colvarsOn;
3003  // Set the positions of lone pairs before copying to NB buffers
3004  if (Node::Object()->molecule->is_lonepairs_psf) {
3005  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3006  }
3007 
3008  bool usePatchPme = false;
3009  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3010  // This will check if the current lattice is compatible with the patch-level PME kernels.
3011  // This needs to be redone every time the lattice changes; the results are stored in
3012  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3013  // function.
3014  //
3015  // The PME behavior of set_compute_positions differs depending on the kernels being used,
3016  // so that value needs to be passed to the kernel object.
3017  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
3018  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3019  cudaPme->checkPatchLevelLatticeCompatibilityAndComputeOffsets(
3020  myLattice,
3021  getNumPatchesHome(),
3022  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3023  getHostPatchMin(),
3024  getHostPatchMax(),
3025  getHostAwayDists()
3026  );
3027  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3028  }
3029 
3030  if (1) {
3031  //fprintf(stderr, "calling set_compute_positions() ****************************************\n");
3032  //fprintf(stderr, "calling set_compute_positions\n");
3033  //fprintf(stderr, "doNbond=%d doSlow=%d\n", doNbond, doSlow);
3034  std::vector<int> atom_counts;
3035  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3036  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3037  }
3038  CUDASequencerKernel->set_compute_positions(
3039  deviceIndex,
3041  nDevices,
3042  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3043  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3044 #ifdef NAMD_NCCL_ALLREDUCE
3045  (mGpuOn) ? d_posNew_x: d_pos_x,
3046  (mGpuOn) ? d_posNew_y: d_pos_y,
3047  (mGpuOn) ? d_posNew_z: d_pos_z,
3048 #else
3049  d_pos_x,
3050  d_pos_y,
3051  d_pos_z,
3052  d_peer_pos_x, // passes double-pointer if mgpuOn
3053  d_peer_pos_y,
3054  d_peer_pos_z,
3055  d_peer_charge,
3056  d_peer_partition,
3057 #endif
3058  d_charge, d_partition, charge_scaling,
3059  d_patchCenter,
3060  patchData->devData[deviceIndex].slow_patchPositions,
3061  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3062  d_sortOrder, myLattice,
3063  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3064  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3066  patchData->devData[deviceIndex].d_localPatches,
3067  patchData->devData[deviceIndex].d_peerPatches,
3068  atom_counts,
3069  stream);
3070  // For global forces, copy the coordinates to host with kernel overlap
3071  if (doGlobal) {
3072  NAMD_EVENT_START(1, NamdProfileEvent::GM_CPY_POSITION);
3073  // CkPrintf("WARNING this probably needs to be changed for multihost\n");
3074  copyPositionsToHost_direct();
3075  //copyPositionsAndVelocitiesToHost(1,0);
3076  NAMD_EVENT_STOP(1, NamdProfileEvent::GM_CPY_POSITION);
3077  }
3078  }
3079 }
3080 
3081 void SequencerCUDA::finish_part1( const int copyIn,
3082  const int savePairlists,
3083  const int usePairlists,
3084  NodeReduction* reduction)
3085 {
3086  // JM: If we're not in a migration step, let's overlap flagging the
3087  // positions before we synchronize the stream, to lessen the compute
3088  // launch overhead.
3089  // Hopefully we will see some overlap in this region.
3090  //
3091  // TODO: We could split this into a different function and start calling positionsReady
3092  // if we're not on a migration step, so that we can overlap some of that work with the kernels.
3093  // before we synchronize, we can clear the device memory
3094  // (sets the tileListStat for the nbondKernel)
3094  // sets the tileListStat for the nbondKernel
3095  cudaCheck(cudaStreamSynchronize(stream));
3096 
3097  // Checks if periodic cell became too small
3098  if(*h_periodicCellSmall){
3099  NAMD_die("Periodic cell has become too small for original patch grid!\n"
3100  "Possible solutions are to restart from a recent checkpoint,\n"
3101  "increase margin, or disable useFlexibleCell for liquid simulation.");
3102  }
3103 
3104  if (killme[0]) {
3105  // Found at least one atom that is moving too fast.
3106  // Terminating, so loop performance below doesn't matter.
3107  // Loop does not vectorize
3108  double *vel_x, *vel_y, *vel_z;
3109  allocate_host<double>(&vel_x, numAtomsHome);
3110  allocate_host<double>(&vel_y, numAtomsHome);
3111  allocate_host<double>(&vel_z, numAtomsHome);
3112  copy_DtoH<double>(d_vel_x, vel_x, numAtomsHome);
3113  copy_DtoH<double>(d_vel_y, vel_y, numAtomsHome);
3114  copy_DtoH<double>(d_vel_z, vel_z, numAtomsHome);
3115  int cnt = 0;
3116  for (int i=0; i < numAtomsHome; i++) {
3117  BigReal vel2 =
3118  vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3119  if (vel2 > maxvel2) {
3120  ++cnt;
3121  iout << iERROR << " velocity is "
3122  << PDBVELFACTOR * vel_x[i] << " "
3123  << PDBVELFACTOR * vel_y[i] << " "
3124  << PDBVELFACTOR * vel_z[i]
3125  << " (limit is "
3126  << ( PDBVELFACTOR * sqrt(maxvel2) ) << ", atom "
3127  << i << " of " << numAtomsHome
3128  << " pe " << CkMyPe() << ")\n" << endi;
3129  }
3130  }
3131  iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3132  "; simulation has become unstable ("
3133  << cnt << " atoms on pe " << CkMyPe() << ").\n" << endi;
3134  deallocate_host<double>(&vel_x);
3135  deallocate_host<double>(&vel_y);
3136  deallocate_host<double>(&vel_z);
3137  NAMD_die("SequencerCUDA: Atoms moving too fast");
3138  }
3139  else{
3140  // submitHalf reductions
3141  Tensor reduction_virial;
3142  Tensor reduction_intVirialNormal;
3143  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3144  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3145  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3146  // Haochuan: the tensor is not symmetric when there are fixed atoms
3147  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3148  reduction_virial *= 0.5;
3149  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3150  // fprintf(stderr, "GPU calculated internal kinetic energy = %lf\n", intKineticEnergy_half);
3151  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3152  += (intKineticEnergy_half[0] * 0.25);
3153  reduction_intVirialNormal *= 0.5;
3154  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3155  reduction_intVirialNormal);
3156  int migration = (h_marginViolations[0] != 0) ? 1 : 0; // flags migration as TRUE if a margin violation occurred
3157  // if(migration != 0 ) fprintf(stderr, "DEV[%d] = MIGRATION[%d]\n", deviceID, migration);
3158  patchData->migrationFlagPerDevice[deviceIndex] = migration; // Saves the updated migration flag
3159  h_marginViolations[0] = 0;
3160  }
3161 }
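// A minimal sketch of how the host-mapped flags checked above drive the next
// step (restating the code):
//
//   int migration = (h_marginViolations[0] != 0);
//   patchData->migrationFlagPerDevice[deviceIndex] = migration;
//   h_marginViolations[0] = 0;                 // reset for the next margin check
//
// A single nonzero per-device counter is enough to schedule atom migration,
// while *h_periodicCellSmall and killme[0] abort the run instead.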
3162 
3163 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3164  bool copyOut, const int doGlobal){
3165  // CkPrintf("copy positions and velocities to host copyout %d doGlobal %d\n", copyOut, doGlobal);
3166  if(copyOut){
3167  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3168  patchData = cpdata.ckLocalBranch();
3169  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3170  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3171  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3172  const int numAtomsToCopy = numAtomsHome;
3173  copy_DtoH<double>(d_vel_x, vel_x, numAtomsToCopy, stream);
3174  copy_DtoH<double>(d_vel_y, vel_y, numAtomsToCopy, stream);
3175  copy_DtoH<double>(d_vel_z, vel_z, numAtomsToCopy, stream);
3176  if (!doGlobal) {
3177  // We already copied the coordinates if we have global forces
3178  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3179  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3180  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3181  }
3182  cudaCheck(cudaDeviceSynchronize());
3183 
3184  for(int i = 0; i < homePatches.size(); i++){
3185 
3186  // TODO do we need to copy proxy patches as well
3187  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3188  const int numPatchAtoms = localPatches[i].numAtoms;
3189  const int offset = localPatches[i].bufferOffset;
3190  memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3191  memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3192  memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3193  if (!doGlobal) {
3194  // We already copied the coordinates if we have global forces
3195  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3196  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3197  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3198  }
3199  }
3200  }
3201 }
3202 
3203 
3204 void SequencerCUDA::copyPositionsToHost(){
3205  // CkPrintf("copy positions to host \n");
3206  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3207  patchData = cpdata.ckLocalBranch();
3208  std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3209  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3210  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3211 
3212  const int numAtomsToCopy = numAtomsHome;
3213  // We already copied the coordinates if we have global forces
3214  copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3215  copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3216  copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3217  cudaCheck(cudaDeviceSynchronize());
3218 
3219  for(int i = 0; i < homePatches.size(); i++){
3220 
3221  // TODO do we need to copy proxy patches as well
3222  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3223  const int numPatchAtoms = localPatches[i].numAtoms;
3224  const int offset = localPatches[i].bufferOffset;
3225  memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3226  memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3227  memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3228  }
3229 }
3230 
3231 void SequencerCUDA::update_patch_flags()
3232 {
3233  // int pairlists = 1;
3234  int pairlists = (patchData->flags.step < simParams->N);
3235  for (int i=0; i < numPatchesHome; i++) {
3236  HomePatch *patch = patchList[i];
3237  patch->flags.copyIntFlags(patchData->flags); // copy global flags
3238  }
3239 }
3240 
3241 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3242  int pairlists = patchList[0]->flags.step < simParams->N;
3243  for(int i = 0; i < numPatchesHome; i++){
3244  //for(int i = 0; i < numPatches; i++){
3245  HomePatch *patch = patchList[i];
3246  Sequencer *seq = patch->sequencer;
3247  // the following logic is duplicated from Sequencer::runComputeObjects
3248  // Migration always invalidates pairlists
3249  if (doMigration) {
3250  seq->pairlistsAreValid = 0;
3251  }
3252  if (seq->pairlistsAreValid &&
3253  ( patch->flags.doFullElectrostatics || ! simParams->fullElectFrequency )
3254  && (seq->pairlistsAge > seq->pairlistsAgeLimit) ) {
3255  seq->pairlistsAreValid = 0;
3256  }
3257  patch->flags.usePairlists = pairlists || seq->pairlistsAreValid;
3258  patch->flags.savePairlists = pairlists && !seq->pairlistsAreValid;
3259  if(patch->flags.savePairlists){
3260  // We need to rebuild pairlists -> reset tolerance values
3261  patch->flags.pairlistTolerance = patchList[i]->doPairlistCheck_newTolerance; // update pairListTolerance
3262  patch->flags.maxAtomMovement = 0;
3263  patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3264  }else if(patch->flags.usePairlists){
3265  // We can keep going with the existing pairlists -> update tolerances
3266  patch->flags.maxAtomMovement = patchMaxAtomMovement[i];
3267  patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3268  }else{
3269  // End of simulation
3270  patch->flags.maxAtomMovement=99999.;
3271  patch->flags.pairlistTolerance = 0.;
3272  }
3273  }
3274  if(patchList[0]->flags.savePairlists){
3275  // Backs up d_posSave_* for pairlistCheck
3276  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
3277  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
3278  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
3279  myLatticeOld = myLattice;
3280  }
3281 }
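// A minimal sketch of the per-patch pairlist decision made above (restating
// the duplicated Sequencer::runComputeObjects logic):
//
//   stillRunning  = (flags.step < simParams->N);
//   usePairlists  = stillRunning || pairlistsAreValid;
//   savePairlists = stillRunning && !pairlistsAreValid;
//   if (savePairlists) doPairlistCheck_newTolerance *= (1.0 - simParams->pairlistShrink);
//
// Only a rebuild step snapshots the reference positions (d_posSave_*) and the
// lattice (myLatticeOld) that PairListMarginCheck later compares against.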
3282 
3283 void SequencerCUDA::finish_patch_flags(int migration)
3284 {
3285  for (int i=0; i < numPatchesHome; i++) {
3286  HomePatch *patch = patchList[i];
3287  Sequencer *seq = patch->sequencer;
3288  if (patch->flags.savePairlists && patch->flags.doNonbonded) {
3289  seq->pairlistsAreValid = 1;
3290  seq->pairlistsAge = 0;
3291  }
3292  if (seq->pairlistsAreValid /* && ! pressureStep */) {
3293  ++(seq->pairlistsAge);
3294  }
3295  }
3296 }
3297 
3298 
3299 void SequencerCUDA::launch_part2(
3300  const int doMCPressure,
3301  double dt_normal,
3302  double dt_nbond,
3303  double dt_slow,
3304  NodeReduction *reduction,
3305  Vector &origin,
3306  int step,
3307  int maxForceNumber,
3308  const int langevinPistonStep,
3309  const int copyIn,
3310  const int copyOut,
3311  const int doGlobal,
3312  const bool doEnergy)
3313 {
3314  PatchMap* patchMap = PatchMap::Object();
3315  Tensor localVirial;
3316  //cudaTensor h_rigidVirial;
3317  bool doNbond = false;
3318  bool doSlow = false;
3319  cudaCheck(cudaSetDevice(deviceID));
3320  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3321  const int is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3322  // const int doVirial = langevinPistonStep;
3323  // JM: For launch_part2:
3324  // copyIn = migration steps
3325 
3326  reduction->item(REDUCTION_ATOM_CHECKSUM) += numAtomsHome;
3327 
3328  if(mGpuOn){
3329 #ifdef NAMD_NCCL_ALLREDUCE
3330  cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3331 #endif
3332  }
3333 
3334  if(!simParams->langevinOn && !simParams->eFieldOn && !simParams->constraintsOn &&
3335  !simParams->SMDOn && !simParams->groupRestraintsOn && !doMCPressure &&
3336  !simParams->mgridforceOn && ! simParams->gridforceOn && !mGpuOn &&
3337  !simParams->consForceOn &&
3338  (simParams->watmodel == WaterModel::TIP3) &&
3339  (!is_lonepairs_psf)){
3340  CUDASequencerKernel->accumulate_force_kick(
3341  simParams->fixedAtomsOn,
3342  doGlobal,
3343  simParams->useCudaGlobal,
3344  maxForceNumber,
3345  numPatchesHomeAndProxy,
3346  patchData->devData[deviceIndex].d_localPatches,
3347  patchData->devData[deviceIndex].f_bond,
3348  patchData->devData[deviceIndex].f_bond_nbond,
3349  patchData->devData[deviceIndex].f_bond_slow,
3350  patchData->devData[deviceIndex].forceStride,
3351  patchData->devData[deviceIndex].f_nbond,
3352  patchData->devData[deviceIndex].f_nbond_slow,
3353  patchData->devData[deviceIndex].f_slow,
3354  d_f_global_x,
3355  d_f_global_y,
3356  d_f_global_z,
3357  d_f_normal_x,
3358  d_f_normal_y,
3359  d_f_normal_z,
3360  d_f_nbond_x,
3361  d_f_nbond_y,
3362  d_f_nbond_z,
3363  d_f_slow_x,
3364  d_f_slow_y,
3365  d_f_slow_z,
3366  d_vel_x,
3367  d_vel_y,
3368  d_vel_z,
3369  d_recipMass,
3370  d_atomFixed,
3371  dt_normal,
3372  dt_nbond,
3373  dt_slow,
3374  1.0,
3375  d_unsortOrder,
3376  myLattice,
3377  stream
3378  );
3379  }else{
3380  CUDASequencerKernel->accumulateForceToSOA(
3381  doGlobal,
3382  simParams->useCudaGlobal,
3383  maxForceNumber,
3384  numPatchesHomeAndProxy,
3385  nDevices,
3386  patchData->devData[deviceIndex].d_localPatches,
3387  patchData->devData[deviceIndex].f_bond,
3388  patchData->devData[deviceIndex].f_bond_nbond,
3389  patchData->devData[deviceIndex].f_bond_slow,
3390  patchData->devData[deviceIndex].forceStride,
3391  patchData->devData[deviceIndex].f_nbond,
3392  patchData->devData[deviceIndex].f_nbond_slow,
3393  patchData->devData[deviceIndex].f_slow,
3394  d_f_global_x,
3395  d_f_global_y,
3396  d_f_global_z,
3397  d_f_normal_x,
3398  d_f_normal_y,
3399  d_f_normal_z,
3400  d_f_nbond_x,
3401  d_f_nbond_y,
3402  d_f_nbond_z,
3403  d_f_slow_x,
3404  d_f_slow_y,
3405  d_f_slow_z,
3406  d_unsortOrder,
3407  myLattice,
3408  patchData->d_queues,
3409  patchData->d_queueCounters,
3410  d_tbcatomic,
3411  stream
3412  );
3413 
3414  }
3415 
3416  if (mGpuOn) {
3417  // Synchronize device before node barrier
3418  cudaCheck(cudaDeviceSynchronize());
3419  }
3420 }
3421 
3422 // launch_part2 is broken into two parts to support the MC barostat
3423 void SequencerCUDA::launch_part3(
3424  const int doMCPressure,
3425  double dt_normal,
3426  double dt_nbond,
3427  double dt_slow,
3428  NodeReduction *reduction,
3429  Vector &origin,
3430  int step,
3431  int maxForceNumber,
3432  const bool requestGlobalForces,
3433  const int doGlobalStaleForces,
3434  const bool forceRequestedGPU,
3435  const int copyIn,
3436  const int copyOut,
3437  const bool doEnergy,
3438  const bool requestForcesOutput)
3439 {
3440  const int doVirial = simParams->langevinPistonOn || simParams->berendsenPressureOn;
3441  const bool doFixed = simParams->fixedAtomsOn;
3442  const double velrescaling = 1; // no rescaling
3443 
3444  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3445  simParams->SMDOn || simParams->groupRestraintsOn || requestGlobalForces || simParams->gridforceOn|| simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3446  (simParams->watmodel != WaterModel::TIP3) ||
3447  Node::Object()->molecule->is_lonepairs_psf){
3448  if(mGpuOn){
3449 #ifndef NAMD_NCCL_ALLREDUCE
3450  // JM - Awful: We need to busy wait inside accumulateForceToSOA instead
3451  //ncclBroadcast(d_barrierFlag, d_barrierFlag, 1, ncclChar,
3452  // 0, deviceCUDA->getNcclComm(), stream);
3453 
3454  std::vector<int> atom_counts;
3455  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3456  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3457  }
3458  CUDASequencerKernel->mergeForcesFromPeers(
3459  deviceIndex,
3460  maxForceNumber,
3461  myLattice,
3462  numPatchesHomeAndProxy,
3463  numPatchesHome,
3464  this->d_peer_fb_x,
3465  this->d_peer_fb_y,
3466  this->d_peer_fb_z,
3467  this->d_peer_fn_x,
3468  this->d_peer_fn_y,
3469  this->d_peer_fn_z,
3470  this->d_peer_fs_x,
3471  this->d_peer_fs_y,
3472  this->d_peer_fs_z,
3473  // patchData->devData[deviceCUDA->getPmeDevice()].f_slow,
3474  patchData->devData[deviceCUDA->getPmeDeviceIndex()].f_slow,
3475  patchData->devData[deviceIndex].d_localPatches,
3476  patchData->devData[deviceIndex].d_peerPatches,
3477  atom_counts,
3478  stream
3479  );
3480 #else
3481  int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3482  ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream );
3483 #endif
3484  }
3485  if(doVirial && doGlobalStaleForces)
3486  {
3487  memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3488  memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3489  computeGlobalMasterVirial(
3490  numPatchesHomeAndProxy,
3491  patchData->devData[deviceIndex].d_localPatches,
3492  d_pos_x,
3493  d_pos_y,
3494  d_pos_z,
3495  d_transform,
3496  f_global_x,
3497  f_global_y,
3498  f_global_z,
3499  &d_extForce[EXT_GLOBALMTS],
3500  &extForce[EXT_GLOBALMTS],
3501  &d_extVirial[EXT_GLOBALMTS],
3502  &extVirial[EXT_GLOBALMTS],
3503  myLattice,
3504  d_tbcatomic,
3505  stream);
3506  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3507  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3508  }
3509  calculateExternalForces(step, reduction, maxForceNumber, doEnergy, doVirial);
3510 #if 0
3511  cudaCheck(cudaDeviceSynchronize());
3512  if(true || deviceID == 0){
3513  char prefix[10];
3514  snprintf(prefix, 10, "step-%d",step);
3515  this->printSOAForces(prefix);
3516  }
3517 #endif
3518 
3519  }
3520 
3521  if (simParams->langevinOn) {
3522  CUDASequencerKernel->langevinVelocitiesBBK1(
3523  dt_normal, d_langevinParam, d_vel_x, d_vel_y, d_vel_z, numAtomsHome, stream);
3524  }
3525 
3526  if(simParams->langevinOn || simParams->eFieldOn || simParams->constraintsOn ||
3527  simParams->SMDOn || simParams->groupRestraintsOn || doMCPressure || simParams->gridforceOn || simParams->mgridforceOn || mGpuOn || simParams->consForceOn ||
3528  (simParams->watmodel != WaterModel::TIP3) ||
3529  Node::Object()->molecule->is_lonepairs_psf){
3530  CUDASequencerKernel->addForceToMomentum(
3531  doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3532  d_recipMass,
3533  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3534  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3535  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3536  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3537  numAtomsHome, maxForceNumber, stream);
3538  }
3539 
3540  if (simParams->langevinOn) {
3541 
3542  // must enforce rigid bond constraints if langevin gammas differ
3543  if (simParams->rigidBonds != RIGID_NONE &&
3544  simParams->langevinGammasDiffer) {
3545  CUDASequencerKernel->rattle1(
3546  simParams->fixedAtomsOn, doEnergy || doVirial,
3547  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3548  2.0 * simParams->rigidTol,
3549  d_vel_x, d_vel_y, d_vel_z,
3550  d_pos_x, d_pos_y, d_pos_z,
3551  d_velNew_x, d_velNew_y, d_velNew_z,
3552  d_posNew_x, d_posNew_y, d_posNew_z,
3553  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3554  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3555  &settleList, settleListSize, &d_consFailure,
3556  d_consFailureSize, &rattleList, rattleListSize,
3557  &nSettle, &nRattle,
3558  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3559  buildRigidLists, consFailure, simParams->watmodel, stream);
3560  buildRigidLists = false;
3561  }
3562  CUDASequencerKernel->langevinVelocitiesBBK2(
3563  dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3564  d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3565  d_vel_x, d_vel_y, d_vel_z,
3566  numAtomsHome, numAtomsHome, 0,
3567  curandGen, stream);
3568  }
3569  if(simParams->rigidBonds != RIGID_NONE){
3570  CUDASequencerKernel->rattle1(
3571  simParams->fixedAtomsOn, doEnergy || doVirial,
3572  1, numAtomsHome, dt_normal, 1.0/dt_normal,
3573  2.0 * simParams->rigidTol,
3574  d_vel_x, d_vel_y, d_vel_z,
3575  d_pos_x, d_pos_y, d_pos_z,
3576  d_velNew_x, d_velNew_y, d_velNew_z,
3577  d_posNew_x, d_posNew_y, d_posNew_z,
3578  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3579  d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3580  &settleList, settleListSize, &d_consFailure,
3581  d_consFailureSize, &rattleList, rattleListSize,
3582  &nSettle, &nRattle,
3583  d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3584  buildRigidLists, consFailure, simParams->watmodel, stream);
3585  buildRigidLists = false;
3586  }
3587 
3588  // Update velocity center of mass here
3589  if(doEnergy || doVirial){
3590  CUDASequencerKernel->centerOfMass(
3591  d_vel_x, d_vel_y, d_vel_z,
3592  d_vcm_x, d_vcm_y, d_vcm_z,
3593  d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3594  }
3595 
3596  submitHalf(reduction, numAtomsHome, 2, doEnergy || doVirial);
3597 
3598  CUDASequencerKernel->addForceToMomentum(
3599  doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3600  d_recipMass,
3601  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3602  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3603  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3604  d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3605  numAtomsHome, maxForceNumber, stream);
3606 
3607  if(requestGlobalForces || requestForcesOutput) {
3608  // store the forces for next step,
3609  // when we need them for colvars and Tcl scripting
3610  saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3611  }
3612 
3613  if (forceRequestedGPU) {
3614  if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3615  if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3616  if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3617  if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3618  if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3619  if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3620  CUDASequencerKernel->copyForcesToDevice(
3621  numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3622  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3623  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3624  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3625  // cudaCheck(cudaStreamSynchronize(stream));
3626  }
3627 
3628  //cudaCheck(cudaStreamSynchronize(stream));
3629  submitReductions(reduction, origin.x, origin.y, origin.z,
3630  marginViolations, doEnergy || doVirial,
3631  copyOut && simParams->outputMomenta != 0,
3632  numAtomsHome, maxForceNumber);
3633 
3634  // This is for collecting coordinates and velocities for output
3635  copyPositionsAndVelocitiesToHost(copyOut, 0);
3636 
3637  if(consFailure[0]){
3638  // Constraint failure. Abort.
3639  int dieOnError = simParams->rigidDie;
3640  if(dieOnError){
3641  // Bails out
3642  //iout << iWARN << "constraint failure during GPU integration \n" << endi;
3643  NAMD_die("constraint failure during CUDA rattle!\n");
3644  }else{
3645  iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3646  }
3647  }else if(doEnergy || doVirial){
3648  cudaCheck(cudaStreamSynchronize(stream));
3649  if(simParams->rigidBonds != RIGID_NONE){
3650  Tensor reduction_rigidVirial;
3651  COPY_CUDATENSOR(rigidVirial[0], reduction_rigidVirial);
3652  // Haochuan: the tensor is not symmetric when there are fixed atoms
3653  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3654  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL, reduction_rigidVirial);
3655  }
3656 
3657  // SUBMITHALF reductions
3658  Tensor reduction_virial;
3659  Tensor reduction_intVirialNormal;
3660  COPY_CUDATENSOR(virial_half[0], reduction_virial);
3661  COPY_CUDATENSOR(intVirialNormal_half[0], reduction_intVirialNormal);
3662  reduction->item(REDUCTION_HALFSTEP_KINETIC_ENERGY) += (kineticEnergy_half[0] * 0.25);
3663  // Haochuan: the tensor is not symmetric when there are fixed atoms
3664  if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3665  reduction_virial *= 0.5;
3666  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NORMAL,reduction_virial);
3667 
3668  reduction->item(REDUCTION_INT_HALFSTEP_KINETIC_ENERGY)
3669  += (intKineticEnergy_half[0] * 0.25);
3670  reduction_intVirialNormal *= 0.5;
3671  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL,
3672  reduction_intVirialNormal);
3673 
3674  //submitReductions1
3675  reduction->item(REDUCTION_CENTERED_KINETIC_ENERGY) += (kineticEnergy[0] * 0.5);
3676  Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3677  ADD_VECTOR_OBJECT(reduction,REDUCTION_MOMENTUM,momentum);
3678  Vector angularMomentum(*angularMomentum_x,
3679  *angularMomentum_y,
3680  *angularMomentum_z);
3681  ADD_VECTOR_OBJECT(reduction,REDUCTION_ANGULAR_MOMENTUM,angularMomentum);
3682  //submitReductions2
3683  Tensor regintVirialNormal;
3684  Tensor regintVirialNbond;
3685  Tensor regintVirialSlow;
3686  COPY_CUDATENSOR(intVirialNormal[0], regintVirialNormal);
3687  if (maxForceNumber >= 1) {
3688  COPY_CUDATENSOR(intVirialNbond[0], regintVirialNbond);
3689  }
3690  if (maxForceNumber >= 2) {
3691  COPY_CUDATENSOR(intVirialSlow[0], regintVirialSlow);
3692  }
3693 
3694  reduction->item(REDUCTION_INT_CENTERED_KINETIC_ENERGY) += (intKineticEnergy[0] * 0.5);
3695  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NORMAL, regintVirialNormal);
3696  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_NBOND, regintVirialNbond);
3697  ADD_TENSOR_OBJECT(reduction, REDUCTION_INT_VIRIAL_SLOW, regintVirialSlow);
3698 
3699  if (simParams->fixedAtomsOn) {
3700  cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3701  double3 fixForceNormal, fixForceNbond, fixForceSlow;
3702  switch (maxForceNumber) {
3703  case 2: {
3704  copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3705  copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3706  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, fixVirialSlow);
3707  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_SLOW, fixForceSlow);
3708  cudaCheck(cudaMemset(d_fixVirialSlow, 0, 1 * sizeof(cudaTensor)));
3709  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3710  } // intentionally fallthrough
3711  case 1: {
3712  copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3713  copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3714  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, fixVirialNbond);
3715  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NBOND, fixForceNbond);
3716  cudaCheck(cudaMemset(d_fixVirialNbond, 0, 1 * sizeof(cudaTensor)));
3717  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3718  } // intentionally fallthrough
3719  default: {
3720  copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3721  copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3722  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, fixVirialNormal);
3723  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, fixForceNormal);
3724  cudaCheck(cudaMemset(d_fixVirialNormal, 0, 1 * sizeof(cudaTensor)));
3725  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3726  }
3727  }
3728 #if 0
3729  auto printTensor = [](const cudaTensor& t, const std::string& name){
3730  CkPrintf("%s", name.c_str());
3731  CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3732  "%12.5lf %12.5lf %12.5lf\n"
3733  "%12.5lf %12.5lf %12.5lf\n",
3734  t.xx, t.xy, t.xz,
3735  t.yx, t.yy, t.yz,
3736  t.zx, t.zy, t.zz);
3737  };
3738  printTensor(fixVirialNormal, "fixVirialNormal = ");
3739  printTensor(fixVirialNbond, "fixVirialNbond = ");
3740  printTensor(fixVirialSlow, "fixVirialSlow = ");
3741 #endif
3742  }
3743  }
3744 }
3745 
3746 // Adding this function back temporarily until GPU migration is merged
3747 #if 1
3748 // This function will aggregate data within a single GPU, so we need to have the PME positions copied over
3749 void SequencerCUDA::atomUpdatePme()
3750 {
3751  const double charge_scaling = sqrt(COULOMB * ComputeNonbondedUtil::scaling *
3752      ComputeNonbondedUtil::dielectric_1);
3753  // We need to find doNbond and doSlow for upcoming step
3754  bool doNbond = false;
3755  bool doSlow = true;
3756 
3757  bool doFEP = false;
3758  bool doTI = false;
3759  bool doAlchDecouple = false;
3760  bool doAlchSoftCore = false;
3761  if (simParams->alchOn) {
3762  if (simParams->alchFepOn) doFEP = true;
3763  if (simParams->alchThermIntOn) doTI = true;
3764  if (simParams->alchDecouple) doAlchDecouple = true;
3765  if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3766  }
3767 
3768  if (Node::Object()->molecule->is_lonepairs_psf) {
3769  lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3770  }
3771 
3772  bool usePatchPme = false;
3773  if (deviceCUDA->getIsPmeDevice() && doSlow) {
3774  // This will check if the current lattice is compatible with the patch-level PME kernels.
3775  // This needs to be redone every time the lattice changes; the results are stored in
3776  // the cudaPme object, and the overall compatibility can be computed with the compatible()
3777  // function.
3778  //
3779  // The PME behavior of set_compute_positions differs depending on the kernels being used,
3780  // so that value needs to be passed to the kernel object.
3781  ComputeCUDAMgr* cudaMgr = ComputeCUDAMgr::getComputeCUDAMgr();
3782  CudaPmeOneDevice* cudaPme = cudaMgr->getCudaPmeOneDevice();
3783  cudaPme->checkPatchLevelLatticeCompatibilityAndComputeOffsets(
3784  myLattice,
3785  getNumPatchesHome(),
3786  patchData->devData[deviceCUDA->getDeviceIndex()].d_localPatches,
3787  getHostPatchMin(),
3788  getHostPatchMax(),
3789  getHostAwayDists()
3790  );
3791  usePatchPme = cudaPme->patchLevelPmeData.compatible();
3792  }
3793 
3794  std::vector<int> atom_counts;
3795  for (int i = 0; i < deviceCUDA->getDeviceCount(); i++) {
3796  atom_counts.push_back(patchData->devData[i].numAtomsHome);
3797  }
3798  CUDASequencerKernel->set_pme_positions(
3799  deviceIndex,
3801  nDevices,
3802  numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3803  doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3804 #ifdef NAMD_NCCL_ALLREDUCE
3805  (mGpuOn) ? d_posNew_x: d_pos_x,
3806  (mGpuOn) ? d_posNew_y: d_pos_y,
3807  (mGpuOn) ? d_posNew_z: d_pos_z,
3808 #else
3809  d_pos_x,
3810  d_pos_y,
3811  d_pos_z,
3812  d_peer_pos_x, // passes double-pointer if mgpuOn
3813  d_peer_pos_y,
3814  d_peer_pos_z,
3815  d_peer_charge,
3816  d_peer_partition,
3817 #endif
3818  d_charge, d_partition, charge_scaling,
3819  d_patchCenter,
3820  patchData->devData[deviceIndex].slow_patchPositions,
3821  patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3822  d_sortOrder, myLattice,
3823  (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3824  (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3826  patchData->devData[deviceIndex].d_localPatches,
3827  patchData->devData[deviceIndex].d_peerPatches,
3828  atom_counts,
3829  stream);
3830 
3831  cudaCheck(cudaStreamSynchronize(stream));
3832 }
3833 #endif
3834 
3835 
3836 void SequencerCUDA::sync() {
3837  cudaCheck(cudaStreamSynchronize(stream));
3838 }
3839 
3840 void SequencerCUDA::calculateExternalForces(
3841  const int step,
3842  NodeReduction *reduction,
3843  const int maxForceNumber,
3844  const int doEnergy,
3845  const int doVirial) {
3846 
3847  const bool is_lonepairs_psf = Node::Object()->molecule->is_lonepairs_psf;
3848  const bool is_tip4_water = simParams->watmodel == WaterModel::TIP4;
3849 
3850  if (is_lonepairs_psf) {
3851  lonepairsKernel->redistributeForce(
3852  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3853  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3854  d_f_slow_x, d_f_slow_y, d_f_slow_z,
3855  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3856  d_pos_x, d_pos_y, d_pos_z, maxForceNumber, doEnergy || doVirial, stream);
3857  }
3858  // TODO: Should I use "else if" here to follow the logic of Sequencer.C?
3859  if (is_tip4_water) {
3860  redistributeTip4pForces(reduction, maxForceNumber, doEnergy || doVirial);
3861  }
3862 
3863  if(simParams->eFieldOn){
3864  double3 efield;
3865  efield.x = simParams->eField.x;
3866  efield.y = simParams->eField.y;
3867  efield.z = simParams->eField.z;
3868 
3869  double efield_omega = TWOPI * simParams->eFieldFreq / 1000.;
3870  double efield_phi = PI/180. * simParams->eFieldPhase;
3871  double t = step * simParams->dt;
3872 
3873  CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3874  doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3875  d_charge, d_pos_x, d_pos_y, d_pos_z,
3876  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3877  &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3878  &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3879  &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3880  d_tbcatomic, stream);
3881  }
3882 
3883  if(simParams->constraintsOn){
3884  restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3885  d_pos_x, d_pos_y, d_pos_z,
3886  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3887  &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3888  &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3889  &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3890  }
3891 
3892  if(simParams->SMDOn){
3893  SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
3894  d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform,
3895  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3896  &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
3897  &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
3898  }
3899 
3900  if(simParams->groupRestraintsOn){
3901  groupRestraintsKernel->doForce(step, doEnergy, doVirial,
3902  myLattice, d_transform,
3903  d_mass, d_pos_x, d_pos_y, d_pos_z,
3904  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3905  &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
3906  &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
3907  }
3908  if(simParams->mgridforceOn || simParams->gridforceOn){
3909  gridForceKernel->doForce(doEnergy, doVirial,
3910  myLattice, step,
3911  d_pos_x, d_pos_y, d_pos_z, d_transform,
3912  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3913  stream);
3914  }
3915  if(simParams->consForceOn){
3916  consForceKernel->doForce(myLattice, doVirial,
3917  d_pos_x, d_pos_y, d_pos_z,
3918  d_f_normal_x, d_f_normal_y, d_f_normal_z,
3919  d_transform,
3920  &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
3921  &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
3922  }
3923 
3924  if(doEnergy || doVirial) {
3925  // Store the external forces and energy data
3926  cudaCheck(cudaStreamSynchronize(stream));
3927  if (is_lonepairs_psf || is_tip4_water) {
3928  switch (maxForceNumber) {
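      // intentional fallthrough between the cases below: accumulate the
      // lone-pair virials for every force level up to maxForceNumber
      // (same pattern as the fixed-atom switch in launch_part3)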
3929  case 2:
3930  copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
3931  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, lpVirialSlow[0]);
3932  cudaCheck(cudaMemset(d_lpVirialSlow, 0, 1 * sizeof(cudaTensor)));
3933  case 1:
3934  copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
3935  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, lpVirialNbond[0]);
3936  cudaCheck(cudaMemset(d_lpVirialNbond, 0, 1 * sizeof(cudaTensor)));
3937  case 0:
3938  copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
3939  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, lpVirialNormal[0]);
3940  cudaCheck(cudaMemset(d_lpVirialNormal, 0, 1 * sizeof(cudaTensor)));
3941  }
3942  }
3943  if(simParams->eFieldOn){
3944  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_ELEC_FIELD];
3945  if (!simParams->eFieldNormalized){
3946  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
3947  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
3948  }
3949  }
3950 
3951  if(simParams->constraintsOn){
3952  reduction->item(REDUCTION_BC_ENERGY) += extEnergy[EXT_CONSTRAINTS];
3953  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
3954  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
3955  }
3956 
3957  if(simParams->SMDOn){
3958  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_SMD];
3959  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_SMD]);
3960  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_SMD]);
3961  }
3962 
3963  if(simParams->groupRestraintsOn){
3964  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GROUP_RESTRAINTS];
3965  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
3966  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
3967  }
3968 
3969  if(simParams->mgridforceOn || simParams->gridforceOn){
3970  // first sum across grids
3971  gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
3972  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_GRIDFORCE];
3973  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
3974  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
3975  gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
3976  }
3977  if(simParams->consForceOn){
3978  reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_CONSFORCE];
3979  ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
3980  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
3981  }
3982 
3983  }
3984 }
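// A minimal sketch of the accumulation pattern used throughout
// calculateExternalForces (restating the code, with EXT_X standing for any of
// the slots used above, e.g. EXT_SMD or EXT_CONSTRAINTS):
//
//   kernel->doForce(..., &d_extForce[EXT_X], &extForce[EXT_X],
//                        &d_extVirial[EXT_X], &extVirial[EXT_X], ...);
//   if (doEnergy || doVirial) {                 // one sync, then fold into the reduction
//     cudaStreamSynchronize(stream);
//     reduction->item(REDUCTION_MISC_ENERGY) += extEnergy[EXT_X];
//     ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_X]);
//     ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL,    extVirial[EXT_X]);
//   }
//
// Host-side energy/virial gathering only happens on energy or virial steps, so
// ordinary steps avoid the device-to-host copies and the extra synchronization.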
3985 
3986 void SequencerCUDA::copyGlobalForcesToDevice(){
3987  // copy the globalMaster forces from host to device.
3988  // Use normal force on host to aggregate it
3989  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3990  // fprintf(stderr, "PE[%d] pos/vel printout, numPatchesHome = %d\n", CkMyPe(), numPatchesHome);
3991  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3992  // TODO: determine if this aggregation needs peers and to be home and proxy
3993  for(int i =0 ; i < numPatchesHome; i++){
3994  CudaLocalRecord record = localPatches[i];
3995  const int patchID = record.patchID;
3996  const int stride = record.bufferOffset;
3997  const int numPatchAtoms = record.numAtoms;
3998  PatchDataSOA& current = homePatches[i]->patchDataSOA;
3999  memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4000  memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4001  memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4002  }
4003  // copy aggregated force to device buffer
4004  copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4005  copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4006  copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4007 
4008 }
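// A minimal sketch of the host<->device aggregation used above and in
// copyPositionsToHost (restating the code): CudaLocalRecord::bufferOffset maps
// each home patch into the contiguous per-device arrays, so per component
//
//   gather:  memcpy(f_global_x + rec.bufferOffset, soa.f_global_x, rec.numAtoms*sizeof(double));
//            copy_HtoD(f_global_x, d_f_global_x, numAtomsHome, stream);
//   scatter: copy_DtoH(d_pos_x, pos_x, numAtomsHome, stream);
//            memcpy(soa.pos_x, pos_x + rec.bufferOffset, rec.numAtoms*sizeof(double));
//
// i.e. one small memcpy per patch plus a single bulk transfer per direction.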
4009 
4010 void SequencerCUDA::updateHostPatchDataSOA() {
4011  std::vector<PatchDataSOA> host_copy(numPatchesHome);
4012  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4013 
4014  for(int i =0 ; i < numPatchesHome; i++) {
4015  host_copy[i] = homePatches[i]->patchDataSOA;
4016  }
4017  copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4018  cudaCheck(cudaDeviceSynchronize());
4019 }
4020 
4021 void SequencerCUDA::saveForceCUDASOA_direct(
4022  const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4023  CUDASequencerKernel->copyForcesToHostSOA(
4024  numPatchesHome,
4025  patchData->devData[deviceIndex].d_localPatches,
4026  maxForceNumber,
4027  d_f_normal_x,
4028  d_f_normal_y,
4029  d_f_normal_z,
4030  d_f_nbond_x,
4031  d_f_nbond_y,
4032  d_f_nbond_z,
4033  d_f_slow_x,
4034  d_f_slow_y,
4035  d_f_slow_z,
4036  d_HostPatchDataSOA,
4037  doGlobal,
4038  doForcesOutput,
4039  stream
4040  );
4041  cudaCheck(cudaStreamSynchronize(stream));
4042 }
4043 
4044 void SequencerCUDA::copyPositionsToHost_direct() {
4045  CUDASequencerKernel->copyPositionsToHostSOA(
4046  numPatchesHome,
4047  patchData->devData[deviceIndex].d_localPatches,
4048  d_pos_x,
4049  d_pos_y,
4050  d_pos_z,
4051  d_HostPatchDataSOA,
4052  stream
4053  );
4054  cudaCheck(cudaStreamSynchronize(stream));
4055 }
4056 
4057 void SequencerCUDA::redistributeTip4pForces(
4058  NodeReduction *reduction,
4059  const int maxForceNumber,
4060  const int doVirial) {
4061  CUDASequencerKernel->redistributeTip4pForces(
4062  d_f_normal_x, d_f_normal_y, d_f_normal_z,
4063  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
4064  d_f_slow_x, d_f_slow_y, d_f_slow_z,
4065  d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4066  d_pos_x, d_pos_y, d_pos_z, d_mass,
4067  numAtomsHome, doVirial, maxForceNumber, stream
4068  );
4069 }
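// Background note for redistributeTip4pForces (not code from this file): the
// TIP4P water model carries a massless charge site, so any force accumulated
// on that site has to be projected back onto the oxygen and hydrogens before
// integration, with matching corrections to d_lpVirialNormal/Nbond/Slow when
// doVirial is set; lonepairsKernel->redistributeForce plays the same role for
// CHARMM lone-pair sites in calculateExternalForces above.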
4070 
4071 void SequencerCUDA::allocateGPUSavedForces() {
4072  allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4073  allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4074  allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4075  allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4076  allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4077  allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4078 }
4079 #endif // NODEGROUP_FORCE_REGISTER
Definition: Molecule.h:619
int getDeviceIndex()
Definition: DeviceCUDA.h:166
BigReal pairlistTolerance
Definition: PatchTypes.h:41
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
double * f_global_x
Definition: NamdTypes.h:427
int pairlistsAgeLimit
Definition: Sequencer.h:231
int pairlistsAreValid
Definition: Sequencer.h:229
size_t alchGetNumOfPMEGrids() const
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int destPatchID[3][3][3]
Definition: CudaUtils.h:123
int is_lonepairs_psf
Definition: Molecule.h:490
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
#define namd_reciprocal(x)
Definition: Vector.h:69
int getDeviceIDforPe(int pe)
Definition: DeviceCUDA.C:523
#define RIGID_NONE
Definition: SimParameters.h:79
int getNumPesSharingDevice()
Definition: DeviceCUDA.h:138
SimParameters *const simParams
Definition: Sequencer.h:320
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
Molecule * molecule
Definition: Node.h:179
BigReal zy
Definition: CudaUtils.h:83
BigReal xy
Definition: CudaUtils.h:77
double BigReal
Definition: common.h:123
CudaPmeOneDevice * getCudaPmeOneDevice()
int32 numAtoms
number of atoms
Definition: NamdTypes.h:446