#ifdef NODEGROUP_FORCE_REGISTER

// Gather a per-patch SOA field from every home patch into one contiguous
// host buffer, then copy it to the matching device array (d_<fieldName>).
#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHome; i++) { \
      PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
  } while (0)

// Same aggregation, but over home and proxy patches.
#define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
      PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
  } while (0)

#else

// Without NODEGROUP_FORCE_REGISTER, walk the patch map's home patch list instead.
#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
      PatchDataSOA& current = elem->patch->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
  } while (0)

#endif

void SequencerCUDA::registerSOAPointersToHost() {
  // Publish this device's SOA device pointers into the host-side tables shared
  // through patchData, indexed by deviceIndex, so peer devices can find them.
  patchData->h_soa_pos_x[deviceIndex] = this->d_pos_x;
  patchData->h_soa_pos_y[deviceIndex] = this->d_pos_y;
  patchData->h_soa_pos_z[deviceIndex] = this->d_pos_z;

  patchData->h_soa_vel_x[deviceIndex] = this->d_vel_x;
  patchData->h_soa_vel_y[deviceIndex] = this->d_vel_y;
  patchData->h_soa_vel_z[deviceIndex] = this->d_vel_z;

  patchData->h_soa_fb_x[deviceIndex] = this->d_f_normal_x;
  patchData->h_soa_fb_y[deviceIndex] = this->d_f_normal_y;
  patchData->h_soa_fb_z[deviceIndex] = this->d_f_normal_z;

  patchData->h_soa_fn_x[deviceIndex] = this->d_f_nbond_x;
  patchData->h_soa_fn_y[deviceIndex] = this->d_f_nbond_y;
  patchData->h_soa_fn_z[deviceIndex] = this->d_f_nbond_z;

  patchData->h_soa_fs_x[deviceIndex] = this->d_f_slow_x;
  patchData->h_soa_fs_y[deviceIndex] = this->d_f_slow_y;
  patchData->h_soa_fs_z[deviceIndex] = this->d_f_slow_z;

  patchData->h_soa_id[deviceIndex] = this->d_idMig;
  patchData->h_soa_vdwType[deviceIndex] = this->d_vdwType;
  patchData->h_soa_sortOrder[deviceIndex] = this->d_sortOrder;
  patchData->h_soa_unsortOrder[deviceIndex] = this->d_unsortOrder;
  patchData->h_soa_charge[deviceIndex] = this->d_charge;
  patchData->h_soa_patchCenter[deviceIndex] = this->d_patchCenter;
  patchData->h_soa_migrationDestination[deviceIndex] = this->d_migrationDestination;
  patchData->h_soa_sortSoluteIndex[deviceIndex] = this->d_sortSoluteIndex;

  patchData->h_soa_partition[deviceIndex] = this->d_partition;

  patchData->h_atomdata_AoS[deviceIndex] = (FullAtom*) this->d_atomdata_AoS_in;
  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
}
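// Copy the per-device pointer tables assembled by registerSOAPointersToHost()
// into this device's d_peer_* arrays (one entry per device, nDevices total),
// so kernels can dereference peer buffers directly.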
void SequencerCUDA::copySOAHostRegisterToDevice() {
  copy_HtoD<double*>(patchData->h_soa_pos_x, this->d_peer_pos_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_pos_y, this->d_peer_pos_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_pos_z, this->d_peer_pos_z, nDevices, stream);

  copy_HtoD<float*>(patchData->h_soa_charge, this->d_peer_charge, nDevices, stream);

  copy_HtoD<int*>(patchData->h_soa_partition, this->d_peer_partition, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_vel_x, this->d_peer_vel_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_vel_y, this->d_peer_vel_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_vel_z, this->d_peer_vel_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fb_x, this->d_peer_fb_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fb_y, this->d_peer_fb_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fb_z, this->d_peer_fb_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fn_x, this->d_peer_fn_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fn_y, this->d_peer_fn_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fn_z, this->d_peer_fn_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fs_x, this->d_peer_fs_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fs_y, this->d_peer_fs_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fs_z, this->d_peer_fs_z, nDevices, stream);

  copy_HtoD<int4*>(patchData->h_soa_migrationDestination, this->d_peer_migrationDestination, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_sortSoluteIndex, this->d_peer_sortSoluteIndex, nDevices, stream);

  copy_HtoD<int*>(patchData->h_soa_id, this->d_peer_id, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_vdwType, this->d_peer_vdwType, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_sortOrder, this->d_peer_sortOrder, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_unsortOrder, this->d_peer_unsortOrder, nDevices, stream);
  copy_HtoD<double3*>(patchData->h_soa_patchCenter, this->d_peer_patchCenter, nDevices, stream);

  copy_HtoD<FullAtom*>(patchData->h_atomdata_AoS, this->d_peer_atomdata, nDevices, stream);
  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);

  for (int i = 0; i < this->nDevices; i++) {
    h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
  }
  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
}
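// Debugging aid: copy positions and velocities back to the host and print them
// patch by patch under the nodegroup print lock.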
void SequencerCUDA::printSOAPositionsAndVelocities() {
  // ...
  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);
  // ...
  copy_DtoH_sync<BigReal>(d_pos_x, h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_pos_y, h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_pos_z, h_pos_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_vel_x, h_vel_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_vel_y, h_vel_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_vel_z, h_vel_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    const int patchID = record.patchID;
    // ...
    const int numPatchAtoms = record.numAtoms;
    // ...
    fprintf(stderr, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(stderr, " [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
        h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
        h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
    }
  }
  CmiUnlock(this->patchData->printlock);
}
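// Debugging aid: copy the normal, nonbonded, and slow force arrays back to the
// host and print them per patch; with a non-NULL prefix each patch is written
// to its own "<prefix>-patch-<ID>" file instead of stderr.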
void SequencerCUDA::printSOAForces(char *prefix) {
  // ...
  copy_DtoH_sync<BigReal>(d_f_normal_x, h_f_normal_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_normal_y, h_f_normal_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_normal_z, h_f_normal_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_f_nbond_x, h_f_nbond_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_nbond_y, h_f_nbond_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_nbond_z, h_f_nbond_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_f_slow_x, h_f_slow_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_slow_y, h_f_slow_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_slow_z, h_f_slow_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    const int patchID = record.patchID;
    // ...
    const int numPatchAtoms = record.numAtoms;
    FILE *outfile = stderr;
    // ...
    snprintf(fname, 100, "%s-patch-%d", prefix, patchID);
    outfile = fopen(fname, "w");
    // ...
    fprintf(outfile, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(outfile, " [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
        h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
        h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
        h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j]);
    }
    if (prefix != NULL) fclose(outfile);
  }
  CmiUnlock(this->patchData->printlock);
}
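// Per-PE singleton access: construct the SequencerCUDA instance for this PE on
// first use and return the existing one thereafter.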
SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID,
    SimParameters *const sim_Params) {
  if (CkpvAccess(SequencerCUDA_instance) == 0) {
    CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
  }
  return CkpvAccess(SequencerCUDA_instance);
}

SequencerCUDA::SequencerCUDA(const int deviceID_ID,
    SimParameters *const sim_Params) :
  deviceID(deviceID_ID),
  simParams(sim_Params)
{
  restraintsKernel = NULL;
  groupRestraintsKernel = NULL;
  gridForceKernel = NULL;
  consForceKernel = NULL;
  lonepairsKernel = nullptr;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDASequencerKernel = new SequencerCUDAKernel();
  CUDAMigrationKernel = new MigrationCUDAKernel();

  used_grids.resize(num_used_grids, 0);
  // ...
  rescalePairlistTolerance = false;
}
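// Destructor: release the statically sized device/host allocations, the
// constraint scratch buffers, and the per-feature kernel objects owned by this
// sequencer, then destroy the print lock.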
SequencerCUDA::~SequencerCUDA() {
  // ...
  deallocateStaticArrays();
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<int>(&settleList);
  deallocate_device<CudaRattleElem>(&rattleList);
  deallocate_device<int>(&d_consFailure);
  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
  if (restraintsKernel != NULL) delete restraintsKernel;
  if (SMDKernel != NULL) delete SMDKernel;
  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
  if (gridForceKernel != NULL) delete gridForceKernel;
  if (lonepairsKernel != nullptr) delete lonepairsKernel;
  if (consForceKernel != NULL) delete consForceKernel;
  // ...
  CmiDestroyLock(printlock);
}
void SequencerCUDA::zeroScalars() {
  numAtomsHomeAndProxyAllocated = 0;
  numAtomsHomeAllocated = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;
}
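// One-time per-device setup: create the high-priority streams and the cuRAND
// generator, allocate the persistent per-patch and reduction buffers on host
// and device, and zero the accumulators used by the integration kernels.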
void SequencerCUDA::initialize() {
  // ...
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
  // ...
  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
  // ...
#endif

  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
  unsigned long long seed = simParams->randomSeed + CkMyPe();
  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));

  numAtomsHomeAllocated = 0;
  numAtomsHomeAndProxyAllocated = 0;

  totalMarginViolations = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;
  // ...
  mGpuOn = nDevices > 1;
  // ...
  printlock = CmiCreateLock();

  const int numPes = CkNumPes();
  atomMapList.resize(numPes);
  allocate_device<double*>(&d_peer_pos_x, nDevices);
  allocate_device<double*>(&d_peer_pos_y, nDevices);
  allocate_device<double*>(&d_peer_pos_z, nDevices);
  allocate_device<float*>(&d_peer_charge, nDevices);
  allocate_device<int*>(&d_peer_partition, nDevices);
  allocate_device<double*>(&d_peer_vel_x, nDevices);
  allocate_device<double*>(&d_peer_vel_y, nDevices);
  allocate_device<double*>(&d_peer_vel_z, nDevices);

  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
  allocate_device<double3>(&d_fixForceNormal, 1);
  allocate_device<double3>(&d_fixForceNbond, 1);
  allocate_device<double3>(&d_fixForceSlow, 1);
  // ...
  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
  allocate_device<double*>(&d_peer_fb_x, nDevices);
  allocate_device<double*>(&d_peer_fb_y, nDevices);
  allocate_device<double*>(&d_peer_fb_z, nDevices);
  allocate_device<double*>(&d_peer_fn_x, nDevices);
  allocate_device<double*>(&d_peer_fn_y, nDevices);
  allocate_device<double*>(&d_peer_fn_z, nDevices);
  allocate_device<double*>(&d_peer_fs_x, nDevices);
  allocate_device<double*>(&d_peer_fs_y, nDevices);
  allocate_device<double*>(&d_peer_fs_z, nDevices);

  allocate_device<int4*>(&d_peer_migrationDestination, nDevices);
  allocate_device<int*>(&d_peer_sortSoluteIndex, nDevices);
  allocate_device<int*>(&d_peer_id, nDevices);
  allocate_device<int*>(&d_peer_vdwType, nDevices);
  allocate_device<int*>(&d_peer_sortOrder, nDevices);
  allocate_device<int*>(&d_peer_unsortOrder, nDevices);
  allocate_device<double3*>(&d_peer_patchCenter, nDevices);

  allocate_device<FullAtom*>(&d_peer_atomdata, nDevices);
  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);

  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
  allocate_host<double3>(&patchCenter, numPatchesGlobal);
  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
  allocate_host<int>(&patchToDeviceMap, numPatchesGlobal);
  allocate_host<double3>(&awayDists, numPatchesGlobal);
  allocate_host<double3>(&patchMin, numPatchesGlobal);
  allocate_host<double3>(&patchMax, numPatchesGlobal);
  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal);
  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);

  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
  allocate_device<double3>(&d_patchCenter, numPatchesGlobal);
  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);
  allocate_device<int>(&d_killme, 1);
  allocate_device<char>(&d_barrierFlag, 1);
  allocate_device<unsigned int>(&d_tbcatomic, 5);
  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);

  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);

  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);

  allocate_device<SettleParameters>(&sp, 1);
  allocate_host<int>(&killme, 1);
  allocate_host<BigReal>(&kineticEnergy, 1);
  allocate_host<BigReal>(&intKineticEnergy, 1);
  allocate_host<BigReal>(&kineticEnergy_half, 1);
  allocate_host<BigReal>(&intKineticEnergy_half, 1);
  allocate_host<BigReal>(&momentum_x, 1);
  allocate_host<BigReal>(&momentum_y, 1);
  allocate_host<BigReal>(&momentum_z, 1);
  allocate_host<BigReal>(&angularMomentum_x, 1);
  allocate_host<BigReal>(&angularMomentum_y, 1);
  allocate_host<BigReal>(&angularMomentum_z, 1);
  allocate_host<int>(&consFailure, 1);
  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
  allocate_host<unsigned int>(&h_marginViolations, 1);
  allocate_host<unsigned int>(&h_periodicCellSmall, 1);

  allocate_host<cudaTensor>(&virial, 1);
  allocate_host<cudaTensor>(&virial_half, 1);
  allocate_host<cudaTensor>(&intVirialNormal, 1);
  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
  allocate_host<cudaTensor>(&intVirialNbond, 1);
  allocate_host<cudaTensor>(&intVirialSlow, 1);
  allocate_host<cudaTensor>(&rigidVirial, 1);
  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
  allocate_host<cudaTensor>(&lpVirialNormal, 1);
  allocate_host<cudaTensor>(&lpVirialNbond, 1);
  allocate_host<cudaTensor>(&lpVirialSlow, 1);
  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;

  *kineticEnergy = 0.0;
  *intKineticEnergy = 0.0;
  *kineticEnergy_half = 0.0;
  *intKineticEnergy_half = 0.0;
  // ...
  *angularMomentum_x = 0.0;
  *angularMomentum_y = 0.0;
  *angularMomentum_z = 0.0;
  t_setComputePositions = 0;
  t_accumulateForceKick = 0;
  t_submitReductions1 = 0;
  t_submitReductions2 = 0;

  cudaEventCreate(&eventStart);
  cudaEventCreate(&eventStop);
  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));
  // ...
  memset(h_marginViolations, 0, sizeof(unsigned int));
  memset(h_periodicCellSmall, 0, sizeof(unsigned int));

  memset(intVirialNormal, 0, sizeof(cudaTensor));
  memset(intVirialNbond, 0, sizeof(cudaTensor));

  memset(lpVirialNormal, 0, sizeof(cudaTensor));

  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);
  // ...
  d_consFailure = NULL;
  d_consFailureSize = 0;
  numPatchesHome = numPatchesGlobal;
  // ...
  for (int i = 0; i < numPes; i++) {
    // ...
    for (int j = 0; j < npatch; j++) {
      // ...
      patchList.push_back(patch);
      patchNewTolerance[count++] = /* ... */;
      // ...
      patchData->devData[deviceIndex].patches.push_back(patch);
      patchListHomeAndProxy.push_back(patch);
      // ...
    }
  }

  for (int i = 0; i < numPes; ++i) {
    // ...
    patchData->devData[deviceIndex].patches.push_back(patch);
    // ...
  }

#ifdef NAMD_NCCL_ALLREDUCE
  deviceCUDA->setNcclUniqueId(patchData->ncclId);
  // ...
#endif
  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList, /* ... */);
  // ...
  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  SMDKernel->initPeerCOM(cudaMgr->curSMDCOM, stream);
  // ...
  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
    simParams->groupRestraints, mGpuOn, nDevices, deviceIndex);
  // ...
  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  groupRestraintsKernel->initPeerCOM(stream);
  // ...
  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
  // ...
  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
}
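// Tear down and rebuild the grid-force and constant-force kernels when an
// update has been flagged through patchData->updateCounter, then re-map their
// atoms against the current local patch records.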
void SequencerCUDA::updateDeviceKernels() {
  // ...
  if (patchData->updateCounter.fetch_sub(1) >= 1) {
    if (gridForceKernel != NULL) {
      delete gridForceKernel;
      gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
      gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
    }
    if (consForceKernel != NULL) {
      delete consForceKernel;
      consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
      consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
    }
  }
}
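// Grow the per-atom host and device arrays when the new atom counts exceed the
// current allocations. Buffers are padded by OVERALLOC to limit how often this
// happens; returns whether a reallocation was performed.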
bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy) {
  const float OVERALLOC = 1.5f;

  // Nothing to do if the existing allocations are still large enough.
  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated) {
    return false;
  }

  bool realloc_gpu_saved_force = false;
  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
    realloc_gpu_saved_force = true;
  }
  // ...
  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);

  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);

  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);

  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
  allocate_host<char3>(&transform, numAtomsHomeAllocated);
  allocate_host<float>(&mass, numAtomsHomeAllocated);

  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);

  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);
  // ...
  int n = (numAtomsHomeAllocated + 1) & (~1);
  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);

  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);

  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);
  // ...
  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);
  // ...
  allocate_host<int>(&idMig, numAtomsHomeAllocated);
  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
  allocate_device<double>(&d_f_raw, 9 * numAtomsHomeAndProxyAllocated);
  d_f_normal_x = &d_f_raw[numAtomsHomeAndProxyAllocated*0];
  d_f_normal_y = &d_f_raw[numAtomsHomeAndProxyAllocated*1];
  d_f_normal_z = &d_f_raw[numAtomsHomeAndProxyAllocated*2];
  d_f_nbond_x = &d_f_raw[numAtomsHomeAndProxyAllocated*3];
  d_f_nbond_y = &d_f_raw[numAtomsHomeAndProxyAllocated*4];
  d_f_nbond_z = &d_f_raw[numAtomsHomeAndProxyAllocated*5];
  d_f_slow_x = &d_f_raw[numAtomsHomeAndProxyAllocated*6];
  d_f_slow_y = &d_f_raw[numAtomsHomeAndProxyAllocated*7];
  d_f_slow_z = &d_f_raw[numAtomsHomeAndProxyAllocated*8];
  allocate_device<double>(&d_pos_raw, 3 * numAtomsHomeAndProxyAllocated);
  d_pos_x = &d_pos_raw[numAtomsHomeAndProxyAllocated*0];
  d_pos_y = &d_pos_raw[numAtomsHomeAndProxyAllocated*1];
  d_pos_z = &d_pos_raw[numAtomsHomeAndProxyAllocated*2];
  allocate_device<float>(&d_charge, numAtomsHomeAndProxyAllocated);

  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);

  if (realloc_gpu_saved_force) {
    allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
  }
  allocate_device<int>(&d_sortOrder, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_unsortOrder, numAtomsHomeAndProxyAllocated);
  // ...
  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];

  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);

  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);
  // ...
  allocate_device<int>(&d_partition, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
  allocate_device<double>(&d_vel_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_vel_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_vel_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);

  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);

  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);

  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);
  // ...
  allocate_device<int>(&d_idMig, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_vdwType, numAtomsHomeAndProxyAllocated);
  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
  // ...
  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;
  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  cudaCheck(cudaMemset(d_pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_vel_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_vel_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_vel_z, 0, sizeof(double)*numAtomsHomeAllocated));

  cudaCheck(cudaMemset(d_posNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_z, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_z, 0, sizeof(double)*numAtomsHomeAllocated));

  return true;
}
void SequencerCUDA::reallocateMigrationDestination() {
  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
  allocate_device<int4>(&d_migrationDestination, numAtomsHomeAndProxyAllocated);
}
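// Free the per-atom host and device arrays sized by the atom counts; the
// statically sized buffers are released separately in deallocateStaticArrays().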
void SequencerCUDA::deallocateArrays() {
  if (numAtomsHomeAndProxyAllocated != 0) {
    deallocate_host<double>(&f_normal_x);
    deallocate_host<double>(&f_normal_y);
    deallocate_host<double>(&f_normal_z);

    deallocate_host<double>(&f_global_x);
    deallocate_host<double>(&f_global_y);
    deallocate_host<double>(&f_global_z);

    deallocate_host<double>(&f_nbond_x);
    deallocate_host<double>(&f_nbond_y);
    deallocate_host<double>(&f_nbond_z);
    deallocate_host<double>(&f_slow_x);
    deallocate_host<double>(&f_slow_y);
    deallocate_host<double>(&f_slow_z);
    deallocate_host<double>(&pos_x);
    deallocate_host<double>(&pos_y);
    deallocate_host<double>(&pos_z);
    deallocate_host<float>(&charge);
    deallocate_host<int>(&sortOrder);
    deallocate_host<int>(&unsortOrder);
    deallocate_host<double>(&recipMass);
    deallocate_host<double>(&vel_x);
    deallocate_host<double>(&vel_y);
    deallocate_host<double>(&vel_z);
    deallocate_host<char3>(&transform);
    deallocate_host<float>(&mass);
    // ...
    deallocate_host<float>(&langevinParam);
    deallocate_host<float>(&langScalVelBBK2);
    deallocate_host<float>(&langScalRandBBK2);

    deallocate_host<int>(&hydrogenGroupSize);
    deallocate_host<int>(&atomFixed);

    deallocate_host<int>(&groupFixed);
    deallocate_host<double>(&fixedPosition_x);
    deallocate_host<double>(&fixedPosition_y);
    deallocate_host<double>(&fixedPosition_z);

    deallocate_host<float>(&rigidBondLength);

    deallocate_device<double>(&d_f_raw);
    deallocate_device<double>(&d_pos_raw);
    deallocate_device<float>(&d_charge);
    deallocate_device<int>(&d_sortOrder);
    deallocate_device<int>(&d_unsortOrder);

    deallocate_device<int>(&d_partition);

    deallocate_device<double>(&d_posNew_raw);

    deallocate_device<double>(&d_f_rawMC);
    deallocate_device<double>(&d_pos_rawMC);

    deallocate_host<int>(&id);
    deallocate_device<int>(&d_id);
    deallocate_device<int>(&d_idOrder);
    deallocate_device<int>(&d_moleculeAtom);
    deallocate_device<int>(&d_moleculeStartIndex);
    // ...
    deallocate_host<int>(&idMig);
    deallocate_host<int>(&vdwType);
    deallocate_device<int>(&d_idMig);
    deallocate_device<int>(&d_vdwType);
    deallocate_device<FullAtom>(&d_atomdata_AoS);
    deallocate_device<int>(&d_migrationGroupSize);
    deallocate_device<int>(&d_migrationGroupIndex);
    deallocate_device<int>(&d_sortIndex);

    deallocate_device<double>(&d_f_global_x);
    deallocate_device<double>(&d_f_global_y);
    deallocate_device<double>(&d_f_global_z);

    deallocate_device<double>(&d_vel_x);
    deallocate_device<double>(&d_vel_y);
    deallocate_device<double>(&d_vel_z);
    deallocate_device<double>(&d_recipMass);
    deallocate_device<char3>(&d_transform);
    deallocate_device<double>(&d_velNew_x);
    deallocate_device<double>(&d_velNew_y);
    deallocate_device<double>(&d_velNew_z);
    deallocate_device<double>(&d_posSave_x);
    deallocate_device<double>(&d_posSave_y);
    deallocate_device<double>(&d_posSave_z);
    deallocate_device<double>(&d_rcm_x);
    deallocate_device<double>(&d_rcm_y);
    deallocate_device<double>(&d_rcm_z);
    deallocate_device<double>(&d_vcm_x);
    deallocate_device<double>(&d_vcm_y);
    deallocate_device<double>(&d_vcm_z);
    deallocate_device<float>(&d_mass);
    deallocate_device<float>(&d_langevinParam);
    deallocate_device<float>(&d_langScalVelBBK2);
    deallocate_device<float>(&d_langScalRandBBK2);
    deallocate_device<float>(&d_gaussrand_x);
    deallocate_device<float>(&d_gaussrand_y);
    deallocate_device<float>(&d_gaussrand_z);
    deallocate_device<int>(&d_hydrogenGroupSize);
    deallocate_device<float>(&d_rigidBondLength);
    deallocate_device<int>(&d_atomFixed);

    deallocate_device<int>(&d_groupFixed);
    deallocate_device<double>(&d_fixedPosition_x);
    deallocate_device<double>(&d_fixedPosition_y);
    deallocate_device<double>(&d_fixedPosition_z);

    deallocate_device<double>(&d_f_saved_nbond_x);
    deallocate_device<double>(&d_f_saved_nbond_y);
    deallocate_device<double>(&d_f_saved_nbond_z);
    deallocate_device<double>(&d_f_saved_slow_x);
    deallocate_device<double>(&d_f_saved_slow_y);
    deallocate_device<double>(&d_f_saved_slow_z);
  }
}
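// Free the buffers that do not scale with the atom count: reduction scratch,
// per-patch metadata, peer pointer tables, and migration scratch space.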
void SequencerCUDA::deallocateStaticArrays() {
  // ...
  deallocate_host<cudaTensor>(&extVirial);
  deallocate_host<double3>(&extForce);
  deallocate_host<double>(&extEnergy);
  deallocate_host<unsigned int>(&h_marginViolations);
  deallocate_host<unsigned int>(&h_periodicCellSmall);

  deallocate_host<double3>(&awayDists);
  deallocate_host<double3>(&patchMin);
  deallocate_host<double3>(&patchMax);
  deallocate_host<CudaAtom*>(&cudaAtomLists);
  deallocate_host<double3>(&patchCenter);
  deallocate_host<int>(&globalToLocalID);
  deallocate_host<int>(&patchToDeviceMap);
  deallocate_host<Lattice>(&pairlist_lattices);
  deallocate_host<double>(&patchMaxAtomMovement);
  deallocate_host<double>(&patchNewTolerance);
  deallocate_host<CudaMInfo>(&mInfo);
  deallocate_host<bool*>(&h_patchRecordHasForces);

  deallocate_host<cudaTensor>(&lpVirialNormal);
  deallocate_host<cudaTensor>(&lpVirialNbond);
  deallocate_host<cudaTensor>(&lpVirialSlow);

  deallocate_device<double3>(&d_awayDists);
  deallocate_device<double3>(&d_patchMin);
  deallocate_device<double3>(&d_patchMax);
  deallocate_device<int>(&d_globalToLocalID);
  deallocate_device<int>(&d_patchToDeviceMap);
  deallocate_device<int>(&d_sortOrder);
  deallocate_device<int>(&d_unsortOrder);
  deallocate_device<double3>(&d_patchCenter);
  deallocate_device<Lattice>(&d_lattices);
  deallocate_device<Lattice>(&d_pairlist_lattices);
  deallocate_device<double>(&d_patchMaxAtomMovement);
  deallocate_device<double>(&d_patchNewTolerance);
  deallocate_device<CudaMInfo>(&d_mInfo);

  deallocate_device<int>(&d_killme);
  deallocate_device<char>(&d_barrierFlag);
  deallocate_device<unsigned int>(&d_tbcatomic);
  deallocate_device<BigReal>(&d_kineticEnergy);
  deallocate_device<BigReal>(&d_intKineticEnergy);
  deallocate_device<BigReal>(&d_momentum_x);
  deallocate_device<BigReal>(&d_momentum_y);
  deallocate_device<BigReal>(&d_momentum_z);
  deallocate_device<BigReal>(&d_angularMomentum_x);
  deallocate_device<BigReal>(&d_angularMomentum_y);
  deallocate_device<BigReal>(&d_angularMomentum_z);
  deallocate_device<cudaTensor>(&d_virial);
  deallocate_device<cudaTensor>(&d_intVirialNormal);
  deallocate_device<cudaTensor>(&d_intVirialNbond);
  deallocate_device<cudaTensor>(&d_intVirialSlow);
  deallocate_device<cudaTensor>(&d_lpVirialNormal);
  deallocate_device<cudaTensor>(&d_lpVirialNbond);
  deallocate_device<cudaTensor>(&d_lpVirialSlow);
  deallocate_device<cudaTensor>(&d_rigidVirial);
  deallocate_device<cudaTensor>(&d_extVirial);
  deallocate_device<double3>(&d_extForce);
  deallocate_device<double>(&d_extEnergy);
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<unsigned int>(&deviceQueue);
  deallocate_device<double*>(&d_peer_pos_x);
  deallocate_device<double*>(&d_peer_pos_y);
  deallocate_device<double*>(&d_peer_pos_z);
  deallocate_device<float*>(&d_peer_charge);
  deallocate_device<int*>(&d_peer_partition);
  deallocate_device<double*>(&d_peer_vel_x);
  deallocate_device<double*>(&d_peer_vel_y);
  deallocate_device<double*>(&d_peer_vel_z);
  deallocate_device<double*>(&d_peer_fb_x);
  deallocate_device<double*>(&d_peer_fb_y);
  deallocate_device<double*>(&d_peer_fb_z);
  deallocate_device<double*>(&d_peer_fn_x);
  deallocate_device<double*>(&d_peer_fn_y);
  deallocate_device<double*>(&d_peer_fn_z);
  deallocate_device<double*>(&d_peer_fs_x);
  deallocate_device<double*>(&d_peer_fs_y);
  deallocate_device<double*>(&d_peer_fs_z);

  deallocate_device<bool*>(&d_patchRecordHasForces);
  // ...
  deallocate_device<FullAtom>(&d_atomdata_AoS_in);
  deallocate_device<int>(&d_sortSoluteIndex);
  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);

  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
}
void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex) {
  if (!p->patchMapRead) p->readPatchMap();
  for (int x = 0; x < 3; x++) {
    for (int y = 0; y < 3; y++) {
      for (int z = 0; z < 3; z++) {
        // ...
      }
    }
  }
}
void SequencerCUDA::assembleOrderedPatchList() {
  // ...
  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
    HomePatch *p = patchData->devData[deviceIndex].patches[i];
    patchList.push_back(p);
  }
  // ...
  patchListHomeAndProxy.clear();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    const int patchID = localPatches[i].patchID;
    // ...
    for (int d = 0; d < CkNumPes(); d++) {
      // ...
    }
    patchListHomeAndProxy.push_back(patch);
  }
}
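// Convert the device SOA arrays back into AoS FullAtom records with the
// update_AoS kernel, then copy each home patch's atoms into its host-side
// atom array.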
void SequencerCUDA::copyAoSDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;

  CUDAMigrationKernel->update_AoS(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,
    // ...
  );

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    // ...
    copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationLocalInit() {
  CUDAMigrationKernel->computeMigrationGroupIndex(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    d_migrationGroupSize,
    d_migrationGroupIndex,
    // ...
  );

  CUDAMigrationKernel->update_AoS(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,
    // ...
  );

  CUDAMigrationKernel->computeMigrationDestination(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_hydrogenGroupSize,
    d_migrationGroupSize,
    d_migrationGroupIndex,
    d_pos_x, d_pos_y, d_pos_z,
    d_migrationDestination,
    // ...
  );

  CUDAMigrationKernel->performLocalMigration(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_migrationDestination,
    // ...
  );

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationPerform() {
  CUDAMigrationKernel->performMigration(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_migrationGroupSize,
    d_migrationGroupIndex,
    d_migrationDestination,
    // ...
  );
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAtomCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updateLocalRecords(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    patchData->devData[deviceIndex].d_peerPatches,
    // ...
  );
  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateAtomOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updateLocalRecordsOffset(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    // ...
  );
  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateRemoteOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->updatePeerRecords(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    patchData->devData[deviceIndex].d_peerPatches,
    // ...
  );
  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateProxyDestination() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->copyMigrationDestinationToProxies(
    // ...
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    patchData->devData[deviceIndex].d_peerPatches,
    d_peer_migrationDestination,
    // ...
  );
}
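// Pull the updated CudaLocalRecord array (per-patch atom counts and buffer
// offsets after migration) back to the host.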
void SequencerCUDA::copyPatchDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
  }
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::copyAtomDataToDeviceAoS() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
      iout << iERROR << "The number of atoms in patch " << i << " is "
           << numAtoms << ", greater than the limit for GPU atom migration ("
           << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
      NAMD_bug("NAMD has stopped simulating due to the error above, "
               "but you could disable GPUAtomMigration and try again.\n");
    }
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    // ...
    copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), d_atomdata_AoS_in + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
}
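// Aggregate the per-patch SOA host arrays into contiguous buffers and copy
// them to the device. When copyForces is set, the force tiers selected by
// maxForceNumber (slow > nbond > normal) are copied as well.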
void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);

  if (copyForces) {
    // Cases fall through: a higher maxForceNumber also copies the lower tiers.
    switch (maxForceNumber) {
      case 2:
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_x, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_y, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_z, double, stream);
      case 1:
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_x, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_y, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_z, double, stream);
      case 0:
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_x, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_y, double, stream);
        AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_z, double, stream);
    }
  }

  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream);
  // ...
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(partition, int, stream);
  // ...
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);

  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
  // ...
  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);

  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(sortOrder, int, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(unsortOrder, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);
  // ...
  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);

  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
}
void SequencerCUDA::migrationLocalPost(int startup) {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  // ...
  CUDAMigrationKernel->transformMigratedPositions(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
  );
  // ...
  CUDAMigrationKernel->sortSolventAtoms(
    // ...
    patchData->devData[deviceIndex].d_localPatches,
    // ...
  );

  double tempFactor = 1.0;
  // ...
  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);

  CUDAMigrationKernel->copy_AoS_to_SoA(
    // ...
    simParams->langevinOn, dt, kbT, tempFactor,
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,
    // ...
    d_hydrogenGroupSize, d_migrationGroupSize,
    // ...
  );
  // ...
  CUDASequencerKernel->SetAtomIndexOrder(d_idMig, d_idOrder, numAtomsHome, stream);
  // ...
  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
  // ...
  myLatticeOld = myLattice;
  // ...
  CUDASequencerKernel->centerOfMass(
    d_pos_x, d_pos_y, d_pos_z,
    d_rcm_x, d_rcm_y, d_rcm_z,
    d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
  CUDASequencerKernel->centerOfMass(
    d_vel_x, d_vel_y, d_vel_z,
    d_vcm_x, d_vcm_y, d_vcm_z,
    d_mass, d_hydrogenGroupSize, numAtomsHome, stream);

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
  // ...
  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    const int numPatchAtoms = current.numAtoms;
    // ...
    for (int j = 0; j < numPatchAtoms; j++) {
      // ...
    }
    offset += numPatchAtoms;
  }
  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);
  // ...
  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);

  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);

  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);

  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
  // ...
  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);

  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
}
void SequencerCUDA::migrationUpdateDestination() {
  CUDAMigrationKernel->updateMigrationDestination(
    // ...
    d_migrationDestination,
    d_peer_sortSoluteIndex,
    // ...
  );
}
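// Collect per-patch metadata (local/peer records, patch geometry, global-to-
// local maps) for this device and resize the SOA arrays to the new atom
// counts; returns whether reallocateArrays() had to grow the buffers.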
bool SequencerCUDA::copyPatchData(/* ... */)
{
  bool reallocated = false;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  // ...
  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  numPatchesHome = homePatches.size();
  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;
  // ...
  allocate_device<FullAtom>(&d_atomdata_AoS_in, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch);
  allocate_device<int>(&d_sortSoluteIndex, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch);
  d_migrationDestination = NULL;

#if defined(NAMD_HIP)
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
    // ...
    hipDeviceMallocFinegrained);
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
    // ...
    hipDeviceMallocFinegrained);
#else
  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
#endif

  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);

  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
    numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
    peerPatches.size(), stream);
  if (true || mGpuOn) {
    this->assembleOrderedPatchList();
  }
  this->copySettleParameter();

  for (int i = 0; i < numPatchesHome; i++) {
    // ...
    this->copyMigrationInfo(patch, i);
    // ...
    patchToDeviceMap[patch->getPatchID()] = deviceIndex;
  }
  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);
  // ...
  for (int i = 0; i < nDevices; i++) {
    if (i == deviceIndex) continue;
    std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
    for (int j = 0; j < otherPatches.size(); j++) {
      // ...
    }
  }
  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;

  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);

  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    HomePatch *patch = patchListHomeAndProxy[i];
    awayDists[i].x = patch->aAwayDist;
    awayDists[i].y = patch->bAwayDist;
    awayDists[i].z = patch->cAwayDist;
    // ...
  }
  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchCenter, d_patchCenter, numPatchesHomeAndProxy, stream);

  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
                             localPatches[numPatchesHomeAndProxy-1].numAtoms;
  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
                            localPatches[numPatchesHome-1].numAtoms;

  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);

  numAtomsHomePrev = numAtomsHome;
  numAtomsHomeAndProxy = totalAtomCount;
  numAtomsHome = homeAtomCount;

  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;
  // ...
  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
    numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
    peerPatches.size(), stream);
  // ...
  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
  // ...
  return reallocated;
}
void SequencerCUDA::copyDataToPeers(const bool copyIn /* ... */) {
  if (!copyIn) return;
  // ...
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->copyDataToProxies(
    // ...
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    // ...
  );
  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationSortAtomsNonbonded() {
  CUDAMigrationKernel->sortAtoms(
    numPatchesHome, numAtomsHome,
    patchData->devData[deviceIndex].d_localPatches,
    // ...
    d_pos_x, d_pos_y, d_pos_z,
    // ...
  );
}
void SequencerCUDA::maximumMove(
    const double maxvel2,
    /* ... */)
{
  CUDASequencerKernel->maximumMove(
    maxvel2, d_vel_x, d_vel_y, d_vel_z,
    killme, numAtoms, stream);
}
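// Half-step reduction: submitHalf accumulates kinetic energy and virial
// contributions from the current velocities; the surrounding CUDA events time
// the launch and the elapsed time is printed to stderr.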
void SequencerCUDA::submitHalf(
    int numAtoms, int part, const bool doEnergy)
{
  // ...
  Tensor reduction_intVirialNormal;
  // ...
  cudaCheck(cudaEventRecord(eventStart, stream));

  CUDASequencerKernel->submitHalf(
    // ...
    d_vel_x, d_vel_y, d_vel_z,
    d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
    d_kineticEnergy, d_intKineticEnergy,
    d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
    virial_half, intVirialNormal_half,
    d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);

  cudaCheck(cudaEventRecord(eventStop, stream));
  cudaCheck(cudaEventSynchronize(eventStop));
  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
  t_submitReductions2 = 0;
}
1929 void SequencerCUDA::submitReductions(
1933 int marginViolations,
1936 int numAtomsReduction,
1945 CUDASequencerKernel->submitReduction1(
1946 d_pos_x, d_pos_y, d_pos_z,
1947 d_vel_x, d_vel_y, d_vel_z, d_mass,
1949 d_momentum_x, d_momentum_y, d_momentum_z,
1950 d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1951 origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1952 momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1953 numAtomsReduction, stream);
1955 Tensor regintVirialNormal;
1956 Tensor regintVirialNbond;
1960 cudaCheck(cudaEventRecord(eventStart,stream));
1962 CUDASequencerKernel->submitReduction2(
1964 d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
1965 d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1966 d_f_normal_x, d_f_normal_y, d_f_normal_z,
1967 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1968 d_f_slow_x, d_f_slow_y, d_f_slow_z,
1969 d_mass, d_hydrogenGroupSize,
1970 d_kineticEnergy, kineticEnergy,
1971 d_intKineticEnergy, intKineticEnergy,
1972 d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1973 intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1974 d_tbcatomic, numAtomsReduction, maxForceNumber, simParams->isMultiTimeStepping(), stream);
1977 CUDASequencerKernel->calcFixVirial(
1978 maxForceNumber, numAtomsReduction, d_atomFixed,
1979 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1980 d_f_normal_x, d_f_normal_y, d_f_normal_z,
1981 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1982 d_f_slow_x, d_f_slow_y, d_f_slow_z,
1983 d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1984 d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1988 cudaCheck(cudaEventRecord(eventStop, stream));
1989 cudaCheck(cudaEventSynchronize(eventStop));
1990 cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1991 fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1992 t_submitReductions2 = 0;
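// copySettleParameter: takes the SETTLE constants (water masses and O-H geometry) from
// the first patch that has them initialized and uploads a single SettleParameters
// struct to the device for use by the rigid-water kernels.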
1996 void SequencerCUDA::copySettleParameter(){
2003 for(int i = 0; i < patchList.size(); i++){
2004 if(patchList[i]->settle_initialized) {
2005 patch = patchList[i];
2011 h_sp.mO = patch->settle_mO;
2012 h_sp.mH = patch->settle_mH;
2013 h_sp.mOrmT = patch->settle_mOrmT;
2014 h_sp.mHrmT = patch->settle_mHrmT;
2015 h_sp.rra = patch->settle_rra;
2016 h_sp.ra = patch->settle_ra;
2017 h_sp.rb = patch->settle_rb;
2018 h_sp.rc = patch->settle_rc;
2019 h_sp.r_om = patch->r_om;
2020 h_sp.r_ohc = patch->r_ohc;
2024 copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
2030 void SequencerCUDA::startRun1(
2035 myLattice = lattice;
2038 CUDASequencerKernel->rattle1(
2040 numAtomsHome, 0.f, 0.f,
2042 d_vel_x, d_vel_y, d_vel_z,
2043 d_pos_x, d_pos_y, d_pos_z,
2044 d_velNew_x, d_velNew_y, d_velNew_z,
2045 d_posNew_x, d_posNew_y, d_posNew_z,
2046 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2047 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2048 &settleList, settleListSize, &d_consFailure,
2049 d_consFailureSize, &rattleList, rattleListSize,
2051 d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2052 buildRigidLists, consFailure, simParams->watmodel, stream);
2054 this->copyPositionsAndVelocitiesToHost(1, 0);
2057 printSOAPositionsAndVelocities();
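// startRun2: clears the nonbonded and slow force buffers, then accumulates bonded,
// nonbonded, and slow forces into the SOA force arrays; on multi-GPU runs it also seeds
// the SMD and group-restraint center-of-mass kernels for this device.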
2061 void SequencerCUDA::startRun2(
2082 cudaCheck(cudaMemset(d_f_nbond_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2083 cudaCheck(cudaMemset(d_f_nbond_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2084 cudaCheck(cudaMemset(d_f_nbond_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2086 cudaCheck(cudaMemset(d_f_slow_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2087 cudaCheck(cudaMemset(d_f_slow_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2088 cudaCheck(cudaMemset(d_f_slow_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2090 CUDASequencerKernel->accumulateForceToSOA(
2094 numPatchesHomeAndProxy,
2096 patchData->devData[deviceIndex].d_localPatches,
2097 patchData->devData[deviceIndex].f_bond,
2098 patchData->devData[deviceIndex].f_bond_nbond,
2099 patchData->devData[deviceIndex].f_bond_slow,
2100 patchData->devData[deviceIndex].forceStride,
2101 patchData->devData[deviceIndex].f_nbond,
2102 patchData->devData[deviceIndex].f_nbond_slow,
2103 patchData->devData[deviceIndex].f_slow,
2118 patchData->d_queues,
2119 patchData->d_queueCounters,
2127 SMDKernel->computeCOMMGpu(myLattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2128 d_transform, stream);
2130 if(groupRestraintsKernel)
2132 groupRestraintsKernel->doCOM_mgpu(myLattice, d_transform,
2133 d_mass, d_pos_x, d_pos_y, d_pos_z,
2140 printSOAPositionsAndVelocities();
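// startRun3: merges forces from peer devices (or NCCL all-reduce when enabled), applies
// external and global-master forces, then performs the initial half-kick / rattle /
// center-of-mass sequence so the step-0 energies and reductions are consistent.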
2144 void SequencerCUDA::startRun3(
2149 const bool requestGlobalForces,
2150 int doGlobalMasterStaleForces,
2151 const bool requestForcesOutput,
2152 const bool requestGlobalForcesGPU,
2155 const bool doFixed = simParams->fixedAtomsOn;
2160 std::vector<int> atom_counts;
2162 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2164 CUDASequencerKernel->mergeForcesFromPeers(
2168 numPatchesHomeAndProxy,
2181 patchData->devData[deviceIndex].d_localPatches,
2182 patchData->devData[deviceIndex].d_peerPatches,
2199 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2200 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2203 if(doGlobalMasterStaleForces)
2205 memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
2206 memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
2207 computeGlobalMasterVirial(
2208 numPatchesHomeAndProxy,
2210 patchData->devData[deviceIndex].d_localPatches,
2218 &d_extForce[EXT_GLOBALMTS],
2219 &extForce[EXT_GLOBALMTS],
2220 &d_extVirial[EXT_GLOBALMTS],
2221 &extVirial[EXT_GLOBALMTS],
2228 calculateExternalForces(simParams->firstTimestep, maxForceNumber, 1, 1);
2231 if(true || deviceID == 0){
2233 snprintf(prefix, 10, "step-%d", 0);
2234 this->printSOAForces(prefix);
2238 CUDASequencerKernel->addForceToMomentum(
2239 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2241 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2242 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2243 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2244 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2245 numAtomsHome, maxForceNumber, stream);
2247 CUDASequencerKernel->rattle1(
2249 numAtomsHome, -dt_normal, -1.0/(dt_normal),
2251 d_vel_x, d_vel_y, d_vel_z,
2252 d_pos_x, d_pos_y, d_pos_z,
2253 d_velNew_x, d_velNew_y, d_velNew_z,
2254 d_posNew_x, d_posNew_y, d_posNew_z,
2255 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2256 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2257 &settleList, settleListSize, &d_consFailure,
2258 d_consFailureSize, &rattleList, rattleListSize,
2260 d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2261 true, consFailure, simParams->watmodel, stream);
2263 CUDASequencerKernel->centerOfMass(
2264 d_vel_x, d_vel_y, d_vel_z,
2265 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2266 d_hydrogenGroupSize, numAtomsHome, stream);
2271 submitHalf(numAtomsHome, 1, 1);
2273 cudaCheck(cudaStreamSynchronize(stream));
2275 Tensor reduction_intVirialNormal;
2280 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2281 reduction_virial *= 0.5;
2285 += (intKineticEnergy_half[0] * 0.25);
2286 reduction_intVirialNormal *= 0.5;
2288 reduction_intVirialNormal);
2291 CUDASequencerKernel->addForceToMomentum(
2292 doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2294 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2295 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2296 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2297 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2298 numAtomsHome, maxForceNumber, stream);
2300 CUDASequencerKernel->rattle1(
2302 numAtomsHome, dt_normal, 1.0/dt_normal,
2304 d_vel_x, d_vel_y, d_vel_z,
2305 d_pos_x, d_pos_y, d_pos_z,
2306 d_velNew_x, d_velNew_y, d_velNew_z,
2307 d_posNew_x, d_posNew_y, d_posNew_z,
2308 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2309 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2310 &settleList, settleListSize, &d_consFailure,
2311 d_consFailureSize, &rattleList, rattleListSize,
2313 d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2314 buildRigidLists, consFailure, simParams->watmodel, stream);
2316 CUDASequencerKernel->centerOfMass(
2317 d_vel_x, d_vel_y, d_vel_z,
2318 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2319 d_hydrogenGroupSize, numAtomsHome, stream);
2323 submitHalf(numAtomsHome, 1, 1);
2325 cudaCheck(cudaStreamSynchronize(stream));
2327 Tensor reduction_intVirialNormal;
2332 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2333 reduction_virial *= 0.5;
2337 += (intKineticEnergy_half[0] * 0.25);
2338 reduction_intVirialNormal *= 0.5;
2340 reduction_intVirialNormal);
2343 CUDASequencerKernel->addForceToMomentum(
2344 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2346 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2347 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2348 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2349 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2350 numAtomsHome, maxForceNumber, stream);
2352 if(requestGlobalForces || requestForcesOutput) {
2355 saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2358 if (requestGlobalForcesGPU) {
2359 if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2360 if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2361 if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2362 if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2363 if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2364 if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2365 CUDASequencerKernel->copyForcesToDevice(
2366 numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2367 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2368 d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2369 d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2373 CUDASequencerKernel->centerOfMass(
2374 d_vel_x, d_vel_y, d_vel_z,
2375 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2376 d_hydrogenGroupSize, numAtomsHome, stream);
2378 submitReductions(origin.x, origin.y, origin.z,
2379 marginViolations, 1,
2381 numAtomsHome, maxForceNumber);
2383 copyPositionsAndVelocitiesToHost(1, 0);
2391 NAMD_die("constraint failure during CUDA rattle!\n");
2393 iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2396 cudaCheck(cudaStreamSynchronize(stream));
2397 if (doGlobalMasterStaleForces) {
2398 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
2399 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
2403 Tensor reduction_rigidVirial;
2406 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2412 Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2414 Vector angularMomentum(*angularMomentum_x,
2416 *angularMomentum_z);
2419 Tensor regintVirialNormal;
2420 Tensor regintVirialNbond;
2423 if (maxForceNumber >= 1) {
2426 if (maxForceNumber >= 2) {
2436 cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2437 double3 fixForceNormal, fixForceNbond, fixForceSlow;
2438 switch (maxForceNumber) {
2440 copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2441 copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2445 cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2448 copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2449 copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2453 cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2456 copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2457 copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2461 cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2465 auto printTensor = [](const cudaTensor& t, const std::string& name){
2466 CkPrintf("%s", name.c_str());
2467 CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2468 "%12.5lf %12.5lf %12.5lf\n"
2469 "%12.5lf %12.5lf %12.5lf\n",
2474 printTensor(fixVirialNormal, "fixVirialNormal = ");
2475 printTensor(fixVirialNbond, "fixVirialNbond = ");
2476 printTensor(fixVirialSlow, "fixVirialSlow = ");
2484 this->printSOAForces(NULL);
2489 printSOAPositionsAndVelocities();
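// monteCarloPressure_reject: undoes a rejected volume move by swapping the Monte Carlo
// backup force and position buffers back into the working device pointers.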
2493 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2496 myLattice = lattice;
2500 temp = d_f_normal_x; d_f_normal_x = d_f_normalMC_x; d_f_normalMC_x = temp;
2501 temp = d_f_normal_y; d_f_normal_y = d_f_normalMC_y; d_f_normalMC_y = temp;
2502 temp = d_f_normal_z; d_f_normal_z = d_f_normalMC_z; d_f_normalMC_z = temp;
2503 temp = d_f_nbond_x; d_f_nbond_x = d_f_nbondMC_x; d_f_nbondMC_x = temp;
2504 temp = d_f_nbond_y; d_f_nbond_y = d_f_nbondMC_y; d_f_nbondMC_y = temp;
2505 temp = d_f_nbond_z; d_f_nbond_z = d_f_nbondMC_z; d_f_nbondMC_z = temp;
2506 temp = d_f_slow_x; d_f_slow_x = d_f_slowMC_x; d_f_slowMC_x = temp;
2507 temp = d_f_slow_y; d_f_slow_y = d_f_slowMC_y; d_f_slowMC_y = temp;
2508 temp = d_f_slow_z; d_f_slow_z = d_f_slowMC_z; d_f_slowMC_z = temp;
2509 #ifdef NAMD_NCCL_ALLREDUCE
2511 temp = d_posNew_x; d_posNew_x = d_posMC_x; d_posMC_x = temp;
2512 temp = d_posNew_y; d_posNew_y = d_posMC_y; d_posMC_y = temp;
2513 temp = d_posNew_z; d_posNew_z = d_posMC_z; d_posMC_z = temp;
2515 temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2516 temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2517 temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2520 temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2521 temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2522 temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2526 void SequencerCUDA::monteCarloPressure_accept(
2527 const int doMigration)
2530 CUDASequencerKernel->centerOfMass(
2531 d_pos_x, d_pos_y, d_pos_z,
2532 d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2533 d_hydrogenGroupSize, numAtomsHome, stream);
2538 Tensor reduction_intVirialNormal;
2542 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2543 reduction_virial *= 0.5;
2544 reduction_intVirialNormal *= 0.5;
2547 reduction_intVirialNormal);
2554 myLatticeOld = myLattice;
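// monteCarloPressure_part1: saves the current forces and positions into the MC backup
// buffers, scales coordinates to the trial lattice (scaleCoordinateUsingGC), and
// re-stages compute positions so the trial forces can be evaluated.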
2558 void SequencerCUDA::monteCarloPressure_part1(
2565 copy_DtoD<double>(d_f_normal_x, d_f_normalMC_x, numAtomsHome, stream);
2566 copy_DtoD<double>(d_f_normal_y, d_f_normalMC_y, numAtomsHome, stream);
2567 copy_DtoD<double>(d_f_normal_z, d_f_normalMC_z, numAtomsHome, stream);
2568 copy_DtoD<double>(d_f_nbond_x, d_f_nbondMC_x, numAtomsHome, stream);
2569 copy_DtoD<double>(d_f_nbond_y, d_f_nbondMC_y, numAtomsHome, stream);
2570 copy_DtoD<double>(d_f_nbond_z, d_f_nbondMC_z, numAtomsHome, stream);
2571 copy_DtoD<double>(d_f_slow_x, d_f_slowMC_x, numAtomsHome, stream);
2572 copy_DtoD<double>(d_f_slow_y, d_f_slowMC_y, numAtomsHome, stream);
2573 copy_DtoD<double>(d_f_slow_z, d_f_slowMC_z, numAtomsHome, stream);
2574 #ifdef NAMD_NCCL_ALLREDUCE
2577 copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2578 copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2579 copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2582 copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2583 copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2584 copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2588 copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2589 copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2590 copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2595 Lattice newLattice = oldLattice;
2604 CUDASequencerKernel->scaleCoordinateUsingGC(
2605 d_pos_x, d_pos_y, d_pos_z, d_idOrder, d_moleculeStartIndex,
2606 d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2611 myLattice = newLattice;
2616 bool doNbond = patchData->flags.doNonbonded;
2617 bool doSlow = patchData->flags.doFullElectrostatics;
2620 bool doAlchDecouple = false;
2621 bool doAlchSoftCore = false;
2624 if (simParams->alchThermIntOn) doTI = true;
2625 if (simParams->alchDecouple) doAlchDecouple = true;
2626 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2630 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
2633 bool usePatchPme = false;
2646 getNumPatchesHome(),
2655 std::vector<int> atom_counts;
2657 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2659 CUDASequencerKernel->set_compute_positions(
2663 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2664 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2665 #ifdef NAMD_NCCL_ALLREDUCE
2666 (mGpuOn) ? d_posNew_x: d_pos_x,
2667 (mGpuOn) ? d_posNew_y: d_pos_y,
2668 (mGpuOn) ? d_posNew_z: d_pos_z,
2679 d_charge, d_partition, charge_scaling,
2681 patchData->devData[deviceIndex].slow_patchPositions,
2682 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2683 d_sortOrder, newLattice,
2684 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2685 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2687 patchData->devData[deviceIndex].d_localPatches,
2688 patchData->devData[deviceIndex].d_peerPatches,
2692 cudaCheck(cudaStreamSynchronize(stream));
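// monteCarloPressure_part2: accumulates the trial-configuration forces into the SOA
// buffers, merges them across devices, and evaluates the external forces for the
// acceptance test.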
2695 void SequencerCUDA::monteCarloPressure_part2(
2698 const bool doEnergy,
2699 const bool doGlobal,
2700 const bool doVirial)
2706 #ifdef NAMD_NCCL_ALLREDUCE
2707 cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2711 CUDASequencerKernel->accumulateForceToSOA(
2715 numPatchesHomeAndProxy,
2717 patchData->devData[deviceIndex].d_localPatches,
2718 patchData->devData[deviceIndex].f_bond,
2719 patchData->devData[deviceIndex].f_bond_nbond,
2720 patchData->devData[deviceIndex].f_bond_slow,
2721 patchData->devData[deviceIndex].forceStride,
2722 patchData->devData[deviceIndex].f_nbond,
2723 patchData->devData[deviceIndex].f_nbond_slow,
2724 patchData->devData[deviceIndex].f_slow,
2739 patchData->d_queues,
2740 patchData->d_queueCounters,
2745 #ifndef NAMD_NCCL_ALLREDUCE
2749 std::vector<int> atom_counts;
2751 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2753 CUDASequencerKernel->mergeForcesFromPeers(
2757 numPatchesHomeAndProxy,
2770 patchData->devData[deviceIndex].d_localPatches,
2771 patchData->devData[deviceIndex].d_peerPatches,
2776 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2777 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2781 calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
2784 if(true || deviceID == 0){
2786 snprintf(prefix, 10, "step-%d", step);
2787 this->printSOAForces(prefix);
2793 void SequencerCUDA::setRescalePairlistTolerance(const bool val) {
2794 rescalePairlistTolerance = val;
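// launch_part1: first phase of a normal step. Handles optional velocity reassignment
// and Berendsen scaling, the first half-kick plus drift (velocity Verlet or the
// Langevin-piston path), multi-GPU COM bookkeeping, and the pairlist margin check.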
2797 void SequencerCUDA::launch_part1(
2802 double velrescaling,
2803 const double maxvel2,
2807 int reassignVelocitiesStep,
2808 int langevinPistonStep,
2809 int berendsenPressureStep,
2812 const int savePairlists,
2813 const int usePairlists,
2814 const bool doEnergy)
2819 this->maxvel2 = maxvel2;
2821 const bool doFixed = simParams->fixedAtomsOn;
2824 myLattice = lattice;
2825 if(reassignVelocitiesStep)
2827 const int reassignFreq = simParams->reassignFreq;
2829 newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2834 if ( newTemp < simParams->reassignHold )
2839 CUDASequencerKernel->reassignVelocities(
2840 dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2841 d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2842 d_vel_x, d_vel_y, d_vel_z,
2844 numAtomsHome, numAtomsHome, 0,
2849 if(berendsenPressureStep) {
2854 CUDASequencerKernel->scaleCoordinateWithFactor(
2855 d_pos_x, d_pos_y, d_pos_z, d_mass, d_hydrogenGroupSize,
2856 cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2859 if(!langevinPistonStep){
2862 CUDASequencerKernel->velocityVerlet1(
2863 doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2864 dt_slow, velrescaling, d_recipMass,
2865 d_vel_x, d_vel_y, d_vel_z, maxvel2, killme, d_pos_x, d_pos_y, d_pos_z,
2866 pos_x, pos_y, pos_z, d_f_normal_x, d_f_normal_y, d_f_normal_z,
2867 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z,
2868 d_atomFixed, numAtomsHome, maxForceNumber, stream);
2871 CUDASequencerKernel->addForceToMomentum(
2872 doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2874 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2875 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2876 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2877 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2878 numAtomsHome, maxForceNumber, stream);
2880 maximumMove(maxvel2, numAtomsHome);
2889 CUDASequencerKernel->addVelocityToPosition(
2890 simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2891 d_vel_x, d_vel_y, d_vel_z,
2892 d_pos_x, d_pos_y, d_pos_z,
2893 pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2894 CUDASequencerKernel->langevinPiston(
2896 d_groupFixed, d_transform, lattice,
2897 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2898 d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
2899 d_mass, d_hydrogenGroupSize,
2900 cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2901 simParams->useGroupPressure, numAtomsHome, stream);
2902 CUDASequencerKernel->addVelocityToPosition(
2903 simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2904 d_vel_x, d_vel_y, d_vel_z,
2905 d_pos_x, d_pos_y, d_pos_z,
2906 pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2908 if(mGpuOn && SMDKernel)
2911 SMDKernel->computeCOMMGpu(lattice, d_mass, d_pos_x, d_pos_y, d_pos_z,
2912 d_transform, stream);
2914 if(mGpuOn && groupRestraintsKernel)
2916 groupRestraintsKernel->doCOM_mgpu(lattice, d_transform,
2917 d_mass, d_pos_x, d_pos_y, d_pos_z,
2922 if( (doEnergy || doVirial) ) {
2923 CUDASequencerKernel->centerOfMass(
2924 d_pos_x, d_pos_y, d_pos_z,
2925 d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2926 d_hydrogenGroupSize, numAtomsHome, stream);
2927 CUDASequencerKernel->centerOfMass(
2928 d_vel_x, d_vel_y, d_vel_z,
2929 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2930 d_hydrogenGroupSize, numAtomsHome, stream);
2936 bool doNbond = patchData->flags.doNonbonded;
2937 bool doSlow = patchData->flags.doFullElectrostatics;
2941 bool doAlchDecouple = false;
2942 bool doAlchSoftCore = false;
2945 if (simParams->alchThermIntOn) doTI = true;
2946 if (simParams->alchDecouple) doAlchDecouple = true;
2947 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2949 if ( ! savePairlists ) {
2952 double sysdima = lattice.a_r().unit() * lattice.a();
2953 double sysdimb = lattice.b_r().unit() * lattice.b();
2954 double sysdimc = lattice.c_r().unit() * lattice.c();
2957 CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2958 patchData->devData[deviceIndex].d_localPatches,
2959 d_pos_x, d_pos_y, d_pos_z, d_posSave_x, d_posSave_y, d_posSave_z,
2961 myLattice, myLatticeOld,
2962 d_patchMin, d_patchMax, d_patchCenter,
2964 d_tbcatomic, simParams->pairlistTrigger,
2966 d_patchMaxAtomMovement, patchMaxAtomMovement,
2967 d_patchNewTolerance, patchNewTolerance,
2968 minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2970 h_periodicCellSmall,
2971 rescalePairlistTolerance,
2972 isPeriodic, stream);
2973 rescalePairlistTolerance = false;
2976 rescalePairlistTolerance = true;
2981 cudaCheck(cudaStreamSynchronize(stream));
2985 void SequencerCUDA::launch_part11(
2989 double velrescaling,
2990 const double maxvel2,
2994 int langevinPistonStep,
2997 const int savePairlists,
2998 const int usePairlists,
2999 const bool doEnergy)
3001 const bool doVirial = simParams->langevinPistonOn;
3005 bool doNbond = patchData->flags.doNonbonded;
3006 bool doSlow = patchData->flags.doFullElectrostatics;
3010 bool doAlchDecouple = false;
3011 bool doAlchSoftCore = false;
3014 if (simParams->alchThermIntOn) doTI = true;
3015 if (simParams->alchDecouple) doAlchDecouple = true;
3016 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3019 submitHalf(numAtomsHome, 1, doEnergy || doVirial);
3023 this->update_patch_flags();
3026 finish_part1(copyIn, patchList[0]->flags.savePairlists,
3027 patchList[0]->flags.usePairlists);
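// launch_set_compute_positions: repositions lone pairs when present and scatters the
// current coordinates and charges into the bonded, nonbonded, and PME staging buffers
// for the force computes.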
3031 void SequencerCUDA::launch_set_compute_positions() {
3036 bool doNbond = patchData->flags.doNonbonded;
3037 bool doSlow = patchData->flags.doFullElectrostatics;
3041 bool doAlchDecouple = false;
3042 bool doAlchSoftCore = false;
3045 if (simParams->alchThermIntOn) doTI = true;
3046 if (simParams->alchDecouple) doAlchDecouple = true;
3047 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3052 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3055 bool usePatchPme = false;
3068 getNumPatchesHome(),
3081 std::vector<int> atom_counts;
3083 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3085 CUDASequencerKernel->set_compute_positions(
3089 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3090 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3091 #ifdef NAMD_NCCL_ALLREDUCE
3092 (mGpuOn) ? d_posNew_x: d_pos_x,
3093 (mGpuOn) ? d_posNew_y: d_pos_y,
3094 (mGpuOn) ? d_posNew_z: d_pos_z,
3105 d_charge, d_partition, charge_scaling,
3107 patchData->devData[deviceIndex].slow_patchPositions,
3108 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3109 d_sortOrder, myLattice,
3110 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3111 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3113 patchData->devData[deviceIndex].d_localPatches,
3114 patchData->devData[deviceIndex].d_peerPatches,
3121 copyPositionsToHost_direct();
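// finish_part1: synchronizes the stream, aborts if the periodic cell has become too
// small, and on a velocity-limit violation writes a per-device crash dump of positions
// and velocities before dying; otherwise it finalizes the half-step reductions and sets
// the per-device migration flag.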
3128 void SequencerCUDA::finish_part1(const int copyIn,
3129 const int savePairlists,
3130 const int usePairlists)
3141 cudaCheck(cudaStreamSynchronize(stream));
3144 if(*h_periodicCellSmall){
3145 NAMD_die("Periodic cell has become too small for original patch grid!\n"
3146 "Possible solutions are to restart from a recent checkpoint,\n"
3147 "increase margin, or disable useFlexibleCell for liquid simulation.");
3155 double *vel_x, *vel_y, *vel_z;
3156 std::vector<int> id;
3157 std::vector<int> patchIDofAtoms(numAtomsHome);
3158 allocate_host<double>(&vel_x, numAtomsHome);
3159 allocate_host<double>(&vel_y, numAtomsHome);
3160 allocate_host<double>(&vel_z, numAtomsHome);
3161 copy_DtoH_sync<double>(d_vel_x, vel_x, numAtomsHome);
3162 copy_DtoH_sync<double>(d_vel_y, vel_y, numAtomsHome);
3163 copy_DtoH_sync<double>(d_vel_z, vel_z, numAtomsHome);
3167 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3169 for (int i = 0; i < numPatchesHome; i++) {
3172 const int numPatchAtoms = current.numAtoms;
3173 id.resize(numPatchAtoms + id.size());
3174 for(int j = 0; j < numPatchAtoms; j++){
3176 id[offset + j] = current.id[j];
3178 patchIDofAtoms[offset + j] = patch->getPatchID();
3180 offset += numPatchAtoms;
3183 id.resize(numAtomsHome);
3184 copy_DtoH_sync<int>(d_idMig, id.data(), numAtomsHome);
3187 for (int i=0; i < numAtomsHome; i++) {
3189 vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3190 if (vel2 > maxvel2) {
3201 << " in patch " << patchIDofAtoms[i]
3202 << " on PE " << CkMyPe()
3203 << " with " << patchList[globalToLocalID[patchIDofAtoms[i]]]->patchDataSOA.numAtoms
3204 << " atoms)\n" << endi;
3207 iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3208 "; simulation has become unstable ("
3209 << cnt << " atoms on pe " << CkMyPe() << ", GPU " << deviceID << ").\n" << endi;
3211 double *pos_x, *pos_y, *pos_z;
3212 allocate_host<double>(&pos_x, numAtomsHome);
3213 allocate_host<double>(&pos_y, numAtomsHome);
3214 allocate_host<double>(&pos_z, numAtomsHome);
3215 copy_DtoH_sync<double>(d_pos_x, pos_x, numAtomsHome);
3216 copy_DtoH_sync<double>(d_pos_y, pos_y, numAtomsHome);
3217 copy_DtoH_sync<double>(d_pos_z, pos_z, numAtomsHome);
3219 const std::string outfilename =
3220 std::string(simParams->crashFilename) + "." +
3221 std::to_string(deviceIndex);
3222 std::ofstream ofs_crash_dump(outfilename.c_str());
3223 ofs_crash_dump << "atom,r_x,r_y,r_z,v_x,v_y,v_z\n";
3224 for (int i=0; i < numAtomsHome; i++) {
3225 ofs_crash_dump << id[i]+1 << ","
3233 ofs_crash_dump.flush();
3234 ofs_crash_dump.close();
3235 iout << iWARN << "PE " << CkMyPe() << ", GPU " << deviceID
3236 << ": the atom positions and velocities have been written to "
3237 << outfilename << "\n" << endi;
3238 deallocate_host<double>(&pos_x);
3239 deallocate_host<double>(&pos_y);
3240 deallocate_host<double>(&pos_z);
3242 deallocate_host<double>(&vel_x);
3243 deallocate_host<double>(&vel_y);
3244 deallocate_host<double>(&vel_z);
3245 NAMD_die("SequencerCUDA: Atoms moving too fast");
3250 Tensor reduction_intVirialNormal;
3255 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3256 reduction_virial *= 0.5;
3260 += (intKineticEnergy_half[0] * 0.25);
3261 reduction_intVirialNormal *= 0.5;
3263 reduction_intVirialNormal);
3264 int migration = (h_marginViolations[0] != 0) ? 1 :0;
3266 patchData->migrationFlagPerDevice[deviceIndex] = migration;
3267 h_marginViolations[0] = 0;
3271 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3272 bool copyOut, const int doGlobal){
3275 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3276 patchData = cpdata.ckLocalBranch();
3277 std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3278 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3279 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3280 const int numAtomsToCopy = numAtomsHome;
3281 copy_DtoH<double>(d_vel_x, vel_x, numAtomsToCopy, stream);
3282 copy_DtoH<double>(d_vel_y, vel_y, numAtomsToCopy, stream);
3283 copy_DtoH<double>(d_vel_z, vel_z, numAtomsToCopy, stream);
3286 copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3287 copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3288 copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3292 for(int i = 0; i < homePatches.size(); i++){
3296 const int numPatchAtoms = localPatches[i].numAtoms;
3297 const int offset = localPatches[i].bufferOffset;
3298 memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3299 memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3300 memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3303 memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3304 memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3305 memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3312 void SequencerCUDA::copyPositionsToHost(){
3314 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3315 patchData = cpdata.ckLocalBranch();
3316 std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3317 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3318 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3320 const int numAtomsToCopy = numAtomsHome;
3322 copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3323 copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3324 copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3327 for(int i = 0; i < homePatches.size(); i++){
3331 const int numPatchAtoms = localPatches[i].numAtoms;
3332 const int offset = localPatches[i].bufferOffset;
3333 memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3334 memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3335 memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3339 void SequencerCUDA::update_patch_flags()
3342 int pairlists = (patchData->flags.step < simParams->N);
3343 for (int i=0; i < numPatchesHome; i++) {
3349 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3350 int pairlists = patchList[0]->flags.step < simParams->N;
3351 for(int i = 0; i < numPatchesHome; i++){
3371 patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3375 patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3382 if(patchList[0]->flags.savePairlists){
3384 copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
3385 copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
3386 copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
3387 myLatticeOld = myLattice;
3391 void SequencerCUDA::finish_patch_flags(int migration)
3393 for (int i=0; i < numPatchesHome; i++) {
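// launch_part2: accumulates bonded/nonbonded/slow forces into the SOA force buffers,
// using the fused accumulate_force_kick path when no lone pairs are present and the
// plain accumulateForceToSOA path otherwise.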
3407 void SequencerCUDA::launch_part2(
3408 const int doMCPressure,
3415 const int langevinPistonStep,
3419 const bool doEnergy)
3424 bool doNbond = false;
3425 bool doSlow = false;
3436 #ifdef NAMD_NCCL_ALLREDUCE
3437 cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3446 (!is_lonepairs_psf)){
3447 CUDASequencerKernel->accumulate_force_kick(
3452 numPatchesHomeAndProxy,
3453 patchData->devData[deviceIndex].d_localPatches,
3454 patchData->devData[deviceIndex].f_bond,
3455 patchData->devData[deviceIndex].f_bond_nbond,
3456 patchData->devData[deviceIndex].f_bond_slow,
3457 patchData->devData[deviceIndex].forceStride,
3458 patchData->devData[deviceIndex].f_nbond,
3459 patchData->devData[deviceIndex].f_nbond_slow,
3460 patchData->devData[deviceIndex].f_slow,
3487 CUDASequencerKernel->accumulateForceToSOA(
3491 numPatchesHomeAndProxy,
3493 patchData->devData[deviceIndex].d_localPatches,
3494 patchData->devData[deviceIndex].f_bond,
3495 patchData->devData[deviceIndex].f_bond_nbond,
3496 patchData->devData[deviceIndex].f_bond_slow,
3497 patchData->devData[deviceIndex].forceStride,
3498 patchData->devData[deviceIndex].f_nbond,
3499 patchData->devData[deviceIndex].f_nbond_slow,
3500 patchData->devData[deviceIndex].f_slow,
3515 patchData->d_queues,
3516 patchData->d_queueCounters,
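// launch_part3: merges forces from peer devices, applies external and global forces,
// does the Langevin BBK velocity updates and the final half-kick with constraints, then
// submits reductions and copies positions/velocities back to the host.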
3530 void SequencerCUDA::launch_part3(
3531 const int doMCPressure,
3538 const bool requestGlobalForces,
3539 const int doGlobalStaleForces,
3540 const bool forceRequestedGPU,
3543 const bool doEnergy,
3544 const bool requestForcesOutput)
3547 const bool doFixed = simParams->fixedAtomsOn;
3548 const double velrescaling = 1;
3555 #ifndef NAMD_NCCL_ALLREDUCE
3560 std::vector<int> atom_counts;
3562 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3564 CUDASequencerKernel->mergeForcesFromPeers(
3568 numPatchesHomeAndProxy,
3581 patchData->devData[deviceIndex].d_localPatches,
3582 patchData->devData[deviceIndex].d_peerPatches,
3587 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3588 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
3591 if(doVirial && doGlobalStaleForces)
3593 memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3594 memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3595 computeGlobalMasterVirial(
3596 numPatchesHomeAndProxy,
3598 patchData->devData[deviceIndex].d_localPatches,
3606 &d_extForce[EXT_GLOBALMTS],
3607 &extForce[EXT_GLOBALMTS],
3608 &d_extVirial[EXT_GLOBALMTS],
3609 &extVirial[EXT_GLOBALMTS],
3614 calculateExternalForces(step, maxForceNumber, doEnergy, doVirial);
3617 if(true || deviceID == 0){
3619 snprintf(prefix, 10, "step-%d", step);
3620 this->printSOAForces(prefix);
3627 CUDASequencerKernel->langevinVelocitiesBBK1(
3628 dt_normal, d_langevinParam, d_vel_x, d_vel_y, d_vel_z, numAtomsHome, stream);
3635 CUDASequencerKernel->addForceToMomentum(
3636 doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3638 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3639 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3640 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3641 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3642 numAtomsHome, maxForceNumber, stream);
3650 CUDASequencerKernel->rattle1(
3651 simParams->fixedAtomsOn, doEnergy || doVirial,
3652 1, numAtomsHome, dt_normal, 1.0/dt_normal,
3654 d_vel_x, d_vel_y, d_vel_z,
3655 d_pos_x, d_pos_y, d_pos_z,
3656 d_velNew_x, d_velNew_y, d_velNew_z,
3657 d_posNew_x, d_posNew_y, d_posNew_z,
3658 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3659 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3660 &settleList, settleListSize, &d_consFailure,
3661 d_consFailureSize, &rattleList, rattleListSize,
3663 d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3664 buildRigidLists, consFailure, simParams->watmodel, stream);
3665 buildRigidLists = false;
3667 CUDASequencerKernel->langevinVelocitiesBBK2(
3668 dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3669 d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3670 d_vel_x, d_vel_y, d_vel_z,
3671 numAtomsHome, numAtomsHome, 0,
3675 CUDASequencerKernel->rattle1(
3676 simParams->fixedAtomsOn, doEnergy || doVirial,
3677 1, numAtomsHome, dt_normal, 1.0/dt_normal,
3679 d_vel_x, d_vel_y, d_vel_z,
3680 d_pos_x, d_pos_y, d_pos_z,
3681 d_velNew_x, d_velNew_y, d_velNew_z,
3682 d_posNew_x, d_posNew_y, d_posNew_z,
3683 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3684 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3685 &settleList, settleListSize, &d_consFailure,
3686 d_consFailureSize, &rattleList, rattleListSize,
3688 d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3689 buildRigidLists, consFailure, simParams->watmodel, stream);
3690 buildRigidLists = false;
3694 if(doEnergy || doVirial){
3695 CUDASequencerKernel->centerOfMass(
3696 d_vel_x, d_vel_y, d_vel_z,
3697 d_vcm_x, d_vcm_y, d_vcm_z,
3698 d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3701 submitHalf(numAtomsHome, 2, doEnergy || doVirial);
3703 CUDASequencerKernel->addForceToMomentum(
3704 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3706 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3707 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3708 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3709 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3710 numAtomsHome, maxForceNumber, stream);
3712 if(requestGlobalForces || requestForcesOutput) {
3715 saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3718 if (forceRequestedGPU) {
3719 if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3720 if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3721 if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3722 if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3723 if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3724 if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3725 CUDASequencerKernel->copyForcesToDevice(
3726 numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3727 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3728 d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3729 d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3734 submitReductions(origin.x, origin.y, origin.z,
3735 marginViolations, doEnergy || doVirial,
3736 copyOut && simParams->outputMomenta != 0,
3737 numAtomsHome, maxForceNumber);
3739 copyPositionsAndVelocitiesToHost(copyOut, 0);
3747 NAMD_die("constraint failure during CUDA rattle!\n");
3749 iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3751 } else if(doEnergy || doVirial){
3752 cudaCheck(cudaStreamSynchronize(stream));
3753 if(doVirial && doGlobalStaleForces) {
3754 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3755 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3758 Tensor reduction_rigidVirial;
3761 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3767 Tensor reduction_intVirialNormal;
3772 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3773 reduction_virial *= 0.5;
3777 += (intKineticEnergy_half[0] * 0.25);
3778 reduction_intVirialNormal *= 0.5;
3780 reduction_intVirialNormal);
3784 Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3786 Vector angularMomentum(*angularMomentum_x,
3788 *angularMomentum_z);
3791 Tensor regintVirialNormal;
3792 Tensor regintVirialNbond;
3795 if (maxForceNumber >= 1) {
3798 if (maxForceNumber >= 2) {
3808 cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3809 double3 fixForceNormal, fixForceNbond, fixForceSlow;
3810 switch (maxForceNumber) {
3812 copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3813 copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3817 cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3820 copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3821 copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3825 cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3828 copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3829 copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3833 cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3837 auto printTensor = [](const cudaTensor& t, const std::string& name){
3838 CkPrintf("%s", name.c_str());
3839 CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3840 "%12.5lf %12.5lf %12.5lf\n"
3841 "%12.5lf %12.5lf %12.5lf\n",
3846 printTensor(fixVirialNormal, "fixVirialNormal = ");
3847 printTensor(fixVirialNbond, "fixVirialNbond = ");
3848 printTensor(fixVirialSlow, "fixVirialSlow = ");
3857 void SequencerCUDA::atomUpdatePme()
3862 bool doNbond = false;
3867 bool doAlchDecouple = false;
3868 bool doAlchSoftCore = false;
3871 if (simParams->alchThermIntOn) doTI = true;
3872 if (simParams->alchDecouple) doAlchDecouple = true;
3873 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3877 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3880 bool usePatchPme = false;
3893 getNumPatchesHome(),
3902 std::vector<int> atom_counts;
3904 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3906 CUDASequencerKernel->set_pme_positions(
3910 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3911 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3912 #ifdef NAMD_NCCL_ALLREDUCE
3913 (mGpuOn) ? d_posNew_x: d_pos_x,
3914 (mGpuOn) ? d_posNew_y: d_pos_y,
3915 (mGpuOn) ? d_posNew_z: d_pos_z,
3926 d_charge, d_partition, charge_scaling,
3928 patchData->devData[deviceIndex].slow_patchPositions,
3929 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3930 d_sortOrder, myLattice,
3931 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3932 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3934 patchData->devData[deviceIndex].d_localPatches,
3935 patchData->devData[deviceIndex].d_peerPatches,
3939 cudaCheck(cudaStreamSynchronize(stream));
3944 void SequencerCUDA::sync() {
3945 cudaCheck(cudaStreamSynchronize(stream));
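// calculateExternalForces: dispatches the optional external-force kernels (lone-pair /
// TIP4P force redistribution, electric field, harmonic and group restraints, SMD, grid
// and constant forces) and, when energies or virials are requested, folds their
// contributions into the reduction.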
3948 void SequencerCUDA::calculateExternalForces(
3950 const int maxForceNumber,
3952 const int doVirial) {
3957 if (is_lonepairs_psf) {
3958 lonepairsKernel->redistributeForce(
3959 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3960 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3961 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3962 d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3963 d_pos_x, d_pos_y, d_pos_z, maxForceNumber, doEnergy || doVirial, stream);
3966 if (is_tip4_water) {
3967 redistributeTip4pForces(maxForceNumber, doEnergy || doVirial);
3977 double efield_phi = PI/180. * simParams->eFieldPhase;
3980 CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3981 doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3982 d_charge, d_pos_x, d_pos_y, d_pos_z,
3983 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3984 &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3985 &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3986 &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3987 d_tbcatomic, stream);
3991 restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3992 d_pos_x, d_pos_y, d_pos_z,
3993 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3994 &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3995 &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3996 &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
4000 SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
4001 d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform,
4002 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4003 &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
4004 &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
4008 groupRestraintsKernel->doForce(step, doEnergy, doVirial,
4009 myLattice, d_transform,
4010 d_mass, d_pos_x, d_pos_y, d_pos_z,
4011 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4012 &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
4013 &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
4016 gridForceKernel->doForce(doEnergy, doVirial,
4018 d_pos_x, d_pos_y, d_pos_z, d_transform,
4019 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4023 consForceKernel->doForce(myLattice, doVirial,
4024 d_pos_x, d_pos_y, d_pos_z,
4025 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4027 &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
4028 &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
4031 if(doEnergy || doVirial) {
4033 cudaCheck(cudaStreamSynchronize(stream));
4034 if (is_lonepairs_psf || is_tip4_water) {
4035 switch (maxForceNumber) {
4037 copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
4041 copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
4045 copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
4053 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
4054 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
4060 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
4061 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
4082 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
4084 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
4089 gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4091 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
4092 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
4093 gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
4097 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
4098 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
4103 void SequencerCUDA::copyGlobalForcesToDevice(){
4106 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
4108 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4110 for(int i = 0; i < numPatchesHome; i++){
4112 const int patchID = record.patchID;
4114 const int numPatchAtoms = record.numAtoms;
4116 memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4117 memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4118 memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4121 copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4122 copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4123 copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4127 void SequencerCUDA::updateHostPatchDataSOA() {
4128 std::vector<PatchDataSOA> host_copy(numPatchesHome);
4129 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4131 for(int i = 0; i < numPatchesHome; i++) {
4132 host_copy[i] = homePatches[i]->patchDataSOA;
4134 copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4138 void SequencerCUDA::saveForceCUDASOA_direct(
4139 const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4140 CUDASequencerKernel->copyForcesToHostSOA(
4142 patchData->devData[deviceIndex].d_localPatches,
4158 cudaCheck(cudaStreamSynchronize(stream));
4161 void SequencerCUDA::copyPositionsToHost_direct() {
4162 CUDASequencerKernel->copyPositionsToHostSOA(
4164 patchData->devData[deviceIndex].d_localPatches,
4171 cudaCheck(cudaStreamSynchronize(stream));
4174 void SequencerCUDA::redistributeTip4pForces(
4175 const int maxForceNumber,
4176 const int doVirial) {
4177 CUDASequencerKernel->redistributeTip4pForces(
4178 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4179 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
4180 d_f_slow_x, d_f_slow_y, d_f_slow_z,
4181 d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4182 d_pos_x, d_pos_y, d_pos_z, d_mass,
4183 numAtomsHome, doVirial, maxForceNumber, stream
4187 void SequencerCUDA::allocateGPUSavedForces() {
4188 allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4189 allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4190 allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4191 allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4192 allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4193 allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
4196 void SequencerCUDA::submitReductionValues() {
4197 reduction->submit();