#ifdef NODEGROUP_FORCE_REGISTER

#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHome; i++) { \
      PatchDataSOA& current = patchData->devData[deviceIndex].patches[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHome, stream); \
  } while (0)

#define AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (int i = 0; i < numPatchesHomeAndProxy; i++) { \
      PatchDataSOA& current = patchListHomeAndProxy[i]->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtomsHomeAndProxy, stream); \
  } while (0)

#else

#define AGGREGATE_HOME_ATOMS_TO_DEVICE(fieldName, type, stream) do { \
    int offset = 0; \
    for (HomePatchElem *elem = patchMap->homePatchList()->begin(); elem != patchMap->homePatchList()->end(); elem++) { \
      PatchDataSOA& current = elem->patch->patchDataSOA; \
      const int numPatchAtoms = current.numAtoms; \
      memcpy(fieldName + offset, current.fieldName, numPatchAtoms*sizeof(type)); \
      offset += numPatchAtoms; \
    } \
    copy_HtoD<type>(fieldName, d_ ## fieldName, numAtoms, stream); \
  } while (0)

#endif  // NODEGROUP_FORCE_REGISTER
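
// The aggregation macros above flatten the per-patch SOA host arrays into one
// contiguous host buffer and then issue a single host-to-device copy.  A
// minimal usage sketch (field names are illustrative; the real call sites are
// in copyAtomDataToDevice() further below):
//
//   AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);           // fills vel_x, copies to d_vel_x
//   AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream); // fills charge, copies to d_charge
//
// The token-pasted d_ ## fieldName target must therefore exist as a member
// with a matching name for every field passed in.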
void SequencerCUDA::registerSOAPointersToHost() {
  patchData->h_soa_pos_x[deviceIndex] = this->d_pos_x;
  patchData->h_soa_pos_y[deviceIndex] = this->d_pos_y;
  patchData->h_soa_pos_z[deviceIndex] = this->d_pos_z;

  patchData->h_soa_vel_x[deviceIndex] = this->d_vel_x;
  patchData->h_soa_vel_y[deviceIndex] = this->d_vel_y;
  patchData->h_soa_vel_z[deviceIndex] = this->d_vel_z;

  patchData->h_soa_fb_x[deviceIndex] = this->d_f_normal_x;
  patchData->h_soa_fb_y[deviceIndex] = this->d_f_normal_y;
  patchData->h_soa_fb_z[deviceIndex] = this->d_f_normal_z;

  patchData->h_soa_fn_x[deviceIndex] = this->d_f_nbond_x;
  patchData->h_soa_fn_y[deviceIndex] = this->d_f_nbond_y;
  patchData->h_soa_fn_z[deviceIndex] = this->d_f_nbond_z;

  patchData->h_soa_fs_x[deviceIndex] = this->d_f_slow_x;
  patchData->h_soa_fs_y[deviceIndex] = this->d_f_slow_y;
  patchData->h_soa_fs_z[deviceIndex] = this->d_f_slow_z;

  patchData->h_soa_id[deviceIndex] = this->d_idMig;
  patchData->h_soa_vdwType[deviceIndex] = this->d_vdwType;
  patchData->h_soa_sortOrder[deviceIndex] = this->d_sortOrder;
  patchData->h_soa_unsortOrder[deviceIndex] = this->d_unsortOrder;
  patchData->h_soa_charge[deviceIndex] = this->d_charge;
  patchData->h_soa_patchCenter[deviceIndex] = this->d_patchCenter;
  patchData->h_soa_migrationDestination[deviceIndex] = this->d_migrationDestination;
  patchData->h_soa_sortSoluteIndex[deviceIndex] = this->d_sortSoluteIndex;

  patchData->h_soa_partition[deviceIndex] = this->d_partition;

  patchData->h_atomdata_AoS[deviceIndex] = (FullAtom*) this->d_atomdata_AoS_in;
  patchData->h_peer_record[deviceIndex] = patchData->devData[deviceIndex].d_localPatches;
}
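
// registerSOAPointersToHost() publishes this device's SOA device pointers into
// the shared PatchData host tables; copySOAHostRegisterToDevice() below then
// pushes those tables onto the device so kernels can dereference peer-device
// buffers directly.  A rough ordering sketch (assuming every device has
// already registered and some node-level synchronization sits in between; the
// barrier shown here is illustrative, not the actual mechanism NAMD uses):
//
//   seq->registerSOAPointersToHost();
//   // ... node-wide barrier so all devices have registered ...
//   seq->copySOAHostRegisterToDevice();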
void SequencerCUDA::copySOAHostRegisterToDevice() {
  copy_HtoD<double*>(patchData->h_soa_pos_x, this->d_peer_pos_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_pos_y, this->d_peer_pos_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_pos_z, this->d_peer_pos_z, nDevices, stream);

  copy_HtoD<float*>(patchData->h_soa_charge, this->d_peer_charge, nDevices, stream);

  copy_HtoD<int*>(patchData->h_soa_partition, this->d_peer_partition, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_vel_x, this->d_peer_vel_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_vel_y, this->d_peer_vel_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_vel_z, this->d_peer_vel_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fb_x, this->d_peer_fb_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fb_y, this->d_peer_fb_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fb_z, this->d_peer_fb_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fn_x, this->d_peer_fn_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fn_y, this->d_peer_fn_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fn_z, this->d_peer_fn_z, nDevices, stream);

  copy_HtoD<double*>(patchData->h_soa_fs_x, this->d_peer_fs_x, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fs_y, this->d_peer_fs_y, nDevices, stream);
  copy_HtoD<double*>(patchData->h_soa_fs_z, this->d_peer_fs_z, nDevices, stream);

  copy_HtoD<int4*>(patchData->h_soa_migrationDestination, this->d_peer_migrationDestination, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_sortSoluteIndex, this->d_peer_sortSoluteIndex, nDevices, stream);

  copy_HtoD<int*>(patchData->h_soa_id, this->d_peer_id, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_vdwType, this->d_peer_vdwType, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_sortOrder, this->d_peer_sortOrder, nDevices, stream);
  copy_HtoD<int*>(patchData->h_soa_unsortOrder, this->d_peer_unsortOrder, nDevices, stream);
  copy_HtoD<double3*>(patchData->h_soa_patchCenter, this->d_peer_patchCenter, nDevices, stream);

  copy_HtoD<FullAtom*>(patchData->h_atomdata_AoS, this->d_peer_atomdata, nDevices, stream);
  copy_HtoD<CudaLocalRecord*>(patchData->h_peer_record, this->d_peer_record, nDevices, stream);

  for (int i = 0; i < this->nDevices; i++)
    h_patchRecordHasForces[i] = patchData->devData[i].d_hasPatches;
  copy_HtoD_sync<bool*>(h_patchRecordHasForces, d_patchRecordHasForces, this->nDevices);
}
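
// The two print helpers below are debugging aids: they stage device SOA arrays
// back to host buffers with synchronous copies and dump them per patch.  They
// serialize on patchData->printlock so output from different PEs is not
// interleaved; they are presumably only useful on small test systems.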
void SequencerCUDA::printSOAPositionsAndVelocities() {
  copy_DtoH_sync<BigReal>(d_posNew_x, h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_y, h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_posNew_z, h_pos_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_pos_x, h_pos_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_pos_y, h_pos_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_pos_z, h_pos_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_vel_x, h_vel_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_vel_y, h_vel_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_vel_z, h_vel_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
  for (int i = 0; i < numPatchesHome; i++) {
    const CudaLocalRecord& record = localPatches[i];
    const PatchDataSOA& current = homePatches[i]->patchDataSOA;
    const int patchID = record.patchID;
    const int stride = record.bufferOffset;
    const int numPatchAtoms = record.numAtoms;
    fprintf(stderr, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(stderr, "  [%d, %d, %d] = %lf %lf %lf %lf %lf %lf\n", j, stride + j, current.id[j],
          h_pos_x[stride + j], h_pos_y[stride + j], h_pos_z[stride + j],
          h_vel_x[stride + j], h_vel_y[stride + j], h_vel_z[stride + j]);
    }
  }
  CmiUnlock(this->patchData->printlock);
}
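
// printSOAForces() follows the same pattern but can also redirect the dump to
// per-patch files.  Illustrative calls (the prefix string is arbitrary):
//
//   printSOAForces(NULL);        // dump all patches to stderr
//   printSOAForces("step100");   // write step100-patch-<ID> files instead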
void SequencerCUDA::printSOAForces(char *prefix) {
  copy_DtoH_sync<BigReal>(d_f_normal_x, h_f_normal_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_normal_y, h_f_normal_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_normal_z, h_f_normal_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_f_nbond_x, h_f_nbond_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_nbond_y, h_f_nbond_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_nbond_z, h_f_nbond_z, numAtomsHome);

  copy_DtoH_sync<BigReal>(d_f_slow_x, h_f_slow_x, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_slow_y, h_f_slow_y, numAtomsHome);
  copy_DtoH_sync<BigReal>(d_f_slow_z, h_f_slow_z, numAtomsHome);

  CmiLock(this->patchData->printlock);
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  fprintf(stderr, "PE[%d] force printout\n", CkMyPe());
  for (int i = 0; i < numPatchesHome; i++) {
    const CudaLocalRecord& record = localPatches[i];
    const int patchID = record.patchID;
    const int stride = record.bufferOffset;
    const int numPatchAtoms = record.numAtoms;
    FILE *outfile = stderr;
    char fname[100];
    if (prefix != NULL) {
      snprintf(fname, 100, "%s-patch-%d", prefix, patchID);
      outfile = fopen(fname, "w");
    }
    fprintf(outfile, "Patch [%d]:\n", patchID);
    for (int j = 0; j < numPatchAtoms; j++) {
      fprintf(outfile, "  [%d] = %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", j,
          h_f_normal_x[stride+j], h_f_normal_y[stride+j], h_f_normal_z[stride+j],
          h_f_nbond_x[stride+j], h_f_nbond_y[stride+j], h_f_nbond_z[stride+j],
          h_f_slow_x[stride+j], h_f_slow_y[stride+j], h_f_slow_z[stride+j]);
    }
    if (prefix != NULL) fclose(outfile);
  }
  CmiUnlock(this->patchData->printlock);
}
SequencerCUDA* SequencerCUDA::InstanceInit(const int deviceID_ID, SimParameters *sim_Params) {
  if (CkpvAccess(SequencerCUDA_instance) == 0) {
    CkpvAccess(SequencerCUDA_instance) = new SequencerCUDA(deviceID_ID, sim_Params);
  }
  return CkpvAccess(SequencerCUDA_instance);
}
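
// InstanceInit() implements a per-PE (Charm++ Ckpv) singleton: the first call
// on a processing element constructs the SequencerCUDA object, later calls
// return the cached instance.  Hypothetical access pattern:
//
//   SequencerCUDA *seq = SequencerCUDA::InstanceInit(deviceID, simParams);
//   // subsequent calls on the same PE return the same pointer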
SequencerCUDA::SequencerCUDA(const int deviceID_ID, SimParameters *sim_Params) :
  deviceID(deviceID_ID),
  simParams(sim_Params)
{
  restraintsKernel = NULL;
  groupRestraintsKernel = NULL;
  gridForceKernel = NULL;
  consForceKernel = NULL;
  lonepairsKernel = nullptr;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDASequencerKernel = new SequencerCUDAKernel();
  CUDAMigrationKernel = new MigrationCUDAKernel();

  used_grids.resize(num_used_grids, 0);
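
// ~SequencerCUDA() below releases device-side scratch first and then deletes
// the helper kernel/feature objects; each delete is guarded because optional
// features (SMD, group restraints, grid forces, lone pairs, ...) may never
// have been constructed on this device.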
SequencerCUDA::~SequencerCUDA() {
  deallocateStaticArrays();
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<int>(&settleList);
  deallocate_device<CudaRattleElem>(&rattleList);
  deallocate_device<int>(&d_consFailure);
  if (CUDASequencerKernel != NULL) delete CUDASequencerKernel;
  if (CUDAMigrationKernel != NULL) delete CUDAMigrationKernel;
  if (restraintsKernel != NULL) delete restraintsKernel;
  if (SMDKernel != NULL) delete SMDKernel;
  if (groupRestraintsKernel != NULL) delete groupRestraintsKernel;
  if (gridForceKernel != NULL) delete gridForceKernel;
  if (lonepairsKernel != nullptr) delete lonepairsKernel;
  if (consForceKernel != NULL) delete consForceKernel;

  CmiDestroyLock(printlock);
}
void SequencerCUDA::zeroScalars() {
  numAtomsHomeAndProxyAllocated = 0;
  numAtomsHomeAllocated = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;
}
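
// initialize() creates the integration streams with the highest available
// CUDA stream priority.  The same pattern in isolation (error handling via
// cudaCheck as elsewhere in this file):
//
//   int leastPriority, greatestPriority;
//   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
//   cudaStream_t s;
//   cudaStreamCreateWithPriority(&s, cudaStreamDefault, greatestPriority);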
void SequencerCUDA::initialize() {
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority));
  cudaCheck(cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, greatestPriority));
#endif

  curandCheck(curandCreateGenerator(&curandGen, CURAND_RNG_PSEUDO_DEFAULT));
  unsigned long long seed = simParams->randomSeed + CkMyPe();
  curandCheck(curandSetPseudoRandomGeneratorSeed(curandGen, seed));

  numAtomsHomeAllocated = 0;
  numAtomsHomeAndProxyAllocated = 0;
  totalMarginViolations = 0;
  buildRigidLists = true;
  numPatchesCheckedIn = 0;

  mGpuOn = nDevices > 1;

  printlock = CmiCreateLock();

  const int numPes = CkNumPes();
  atomMapList.resize(numPes);
  allocate_device<double*>(&d_peer_pos_x, nDevices);
  allocate_device<double*>(&d_peer_pos_y, nDevices);
  allocate_device<double*>(&d_peer_pos_z, nDevices);
  allocate_device<float*>(&d_peer_charge, nDevices);
  allocate_device<int*>(&d_peer_partition, nDevices);
  allocate_device<double*>(&d_peer_vel_x, nDevices);
  allocate_device<double*>(&d_peer_vel_y, nDevices);
  allocate_device<double*>(&d_peer_vel_z, nDevices);

  allocate_device<cudaTensor>(&d_fixVirialNormal, 1);
  allocate_device<cudaTensor>(&d_fixVirialNbond, 1);
  allocate_device<cudaTensor>(&d_fixVirialSlow, 1);
  allocate_device<double3>(&d_fixForceNormal, 1);
  allocate_device<double3>(&d_fixForceNbond, 1);
  allocate_device<double3>(&d_fixForceSlow, 1);

  cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
  cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));

  allocate_device<double*>(&d_peer_fb_x, nDevices);
  allocate_device<double*>(&d_peer_fb_y, nDevices);
  allocate_device<double*>(&d_peer_fb_z, nDevices);
  allocate_device<double*>(&d_peer_fn_x, nDevices);
  allocate_device<double*>(&d_peer_fn_y, nDevices);
  allocate_device<double*>(&d_peer_fn_z, nDevices);
  allocate_device<double*>(&d_peer_fs_x, nDevices);
  allocate_device<double*>(&d_peer_fs_y, nDevices);
  allocate_device<double*>(&d_peer_fs_z, nDevices);

  allocate_device<int4*>(&d_peer_migrationDestination, nDevices);
  allocate_device<int*>(&d_peer_sortSoluteIndex, nDevices);
  allocate_device<int*>(&d_peer_id, nDevices);
  allocate_device<int*>(&d_peer_vdwType, nDevices);
  allocate_device<int*>(&d_peer_sortOrder, nDevices);
  allocate_device<int*>(&d_peer_unsortOrder, nDevices);
  allocate_device<double3*>(&d_peer_patchCenter, nDevices);

  allocate_device<FullAtom*>(&d_peer_atomdata, nDevices);
  allocate_device<CudaLocalRecord*>(&d_peer_record, nDevices);

  allocate_device<bool*>(&d_patchRecordHasForces, nDevices);
  allocate_host<bool*>(&h_patchRecordHasForces, nDevices);
  allocate_host<CudaAtom*>(&cudaAtomLists, numPatchesGlobal);
  allocate_host<double3>(&patchCenter, numPatchesGlobal);
  allocate_host<int>(&globalToLocalID, numPatchesGlobal);
  allocate_host<int>(&patchToDeviceMap, numPatchesGlobal);
  allocate_host<double3>(&awayDists, numPatchesGlobal);
  allocate_host<double3>(&patchMin, numPatchesGlobal);
  allocate_host<double3>(&patchMax, numPatchesGlobal);
  allocate_host<Lattice>(&pairlist_lattices, numPatchesGlobal);
  allocate_host<double>(&patchMaxAtomMovement, numPatchesGlobal);
  allocate_host<double>(&patchNewTolerance, numPatchesGlobal);
  allocate_host<CudaMInfo>(&mInfo, numPatchesGlobal);

  allocate_device<double3>(&d_awayDists, numPatchesGlobal);
  allocate_device<double3>(&d_patchMin, numPatchesGlobal);
  allocate_device<double3>(&d_patchMax, numPatchesGlobal);
  allocate_device<int>(&d_globalToLocalID, numPatchesGlobal);
  allocate_device<int>(&d_patchToDeviceMap, numPatchesGlobal);
  allocate_device<double3>(&d_patchCenter, numPatchesGlobal);
  allocate_device<Lattice>(&d_lattices, numPatchesGlobal);
  allocate_device<Lattice>(&d_pairlist_lattices, numPatchesGlobal);
  allocate_device<double>(&d_patchMaxAtomMovement, numPatchesGlobal);
  allocate_device<double>(&d_patchNewTolerance, numPatchesGlobal);
  allocate_device<CudaMInfo>(&d_mInfo, numPatchesGlobal);

  allocate_device<int>(&d_killme, 1);
  allocate_device<char>(&d_barrierFlag, 1);
  allocate_device<unsigned int>(&d_tbcatomic, 5);
  allocate_device<BigReal>(&d_kineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_intKineticEnergy, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_momentum_z, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_x, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_y, ATOMIC_BINS);
  allocate_device<BigReal>(&d_angularMomentum_z, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_virial, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNormal, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialNbond, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_intVirialSlow, ATOMIC_BINS);
  allocate_device<cudaTensor>(&d_rigidVirial, ATOMIC_BINS);

  allocate_device<cudaTensor>(&d_lpVirialNormal, 1);
  allocate_device<cudaTensor>(&d_lpVirialNbond, 1);
  allocate_device<cudaTensor>(&d_lpVirialSlow, 1);

  allocate_device<cudaTensor>(&d_extVirial, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double3>(&d_extForce, ATOMIC_BINS * EXT_FORCE_TOTAL);
  allocate_device<double>(&d_extEnergy, ATOMIC_BINS * EXT_FORCE_TOTAL);

  allocate_device<SettleParameters>(&sp, 1);
  allocate_host<int>(&killme, 1);
  allocate_host<BigReal>(&kineticEnergy, 1);
  allocate_host<BigReal>(&intKineticEnergy, 1);
  allocate_host<BigReal>(&kineticEnergy_half, 1);
  allocate_host<BigReal>(&intKineticEnergy_half, 1);
  allocate_host<BigReal>(&momentum_x, 1);
  allocate_host<BigReal>(&momentum_y, 1);
  allocate_host<BigReal>(&momentum_z, 1);
  allocate_host<BigReal>(&angularMomentum_x, 1);
  allocate_host<BigReal>(&angularMomentum_y, 1);
  allocate_host<BigReal>(&angularMomentum_z, 1);
  allocate_host<int>(&consFailure, 1);
  allocate_host<double>(&extEnergy, EXT_FORCE_TOTAL);
  allocate_host<double3>(&extForce, EXT_FORCE_TOTAL);
  allocate_host<unsigned int>(&h_marginViolations, 1);
  allocate_host<unsigned int>(&h_periodicCellSmall, 1);

  allocate_host<cudaTensor>(&virial, 1);
  allocate_host<cudaTensor>(&virial_half, 1);
  allocate_host<cudaTensor>(&intVirialNormal, 1);
  allocate_host<cudaTensor>(&intVirialNormal_half, 1);
  allocate_host<cudaTensor>(&intVirialNbond, 1);
  allocate_host<cudaTensor>(&intVirialSlow, 1);
  allocate_host<cudaTensor>(&rigidVirial, 1);
  allocate_host<cudaTensor>(&extVirial, EXT_FORCE_TOTAL);
  allocate_host<cudaTensor>(&lpVirialNormal, 1);
  allocate_host<cudaTensor>(&lpVirialNbond, 1);
  allocate_host<cudaTensor>(&lpVirialSlow, 1);

  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;

  *kineticEnergy = 0.0;
  *intKineticEnergy = 0.0;
  *kineticEnergy_half = 0.0;
  *intKineticEnergy_half = 0.0;

  *angularMomentum_x = 0.0;
  *angularMomentum_y = 0.0;
  *angularMomentum_z = 0.0;
  t_setComputePositions = 0;
  t_accumulateForceKick = 0;
  t_submitReductions1 = 0;
  t_submitReductions2 = 0;

  cudaEventCreate(&eventStart);
  cudaEventCreate(&eventStop);
  cudaCheck(cudaMemset(d_patchNewTolerance, 0, sizeof(BigReal)*numPatchesGlobal));
  cudaCheck(cudaMemset(d_tbcatomic, 0, sizeof(unsigned int) * 5));

  memset(h_marginViolations, 0, sizeof(unsigned int));
  memset(h_periodicCellSmall, 0, sizeof(unsigned int));

  memset(intVirialNormal, 0, sizeof(cudaTensor));
  memset(intVirialNbond, 0, sizeof(cudaTensor));
  memset(lpVirialNormal, 0, sizeof(cudaTensor));

  memset(globalToLocalID, -1, sizeof(int)*numPatchesGlobal);

  d_consFailure = NULL;
  d_consFailureSize = 0;
  numPatchesHome = numPatchesGlobal;

  for (int i = 0; i < numPes; i++) {
    for (int j = 0; j < npatch; j++) {
      patchList.push_back(patch);
      patchNewTolerance[count++] =

      patchData->devData[deviceIndex].patches.push_back(patch);
      patchListHomeAndProxy.push_back(patch);

  for (int i = 0; i < numPes; ++i) {
    patchData->devData[deviceIndex].patches.push_back(patch);
#ifdef NAMD_NCCL_ALLREDUCE
  deviceCUDA->setNcclUniqueId(patchData->ncclId);
#endif

  restraintsKernel = new ComputeRestraintsCUDA(patchList, atomMapList,
  groupRestraintsKernel = new ComputeGroupRestraintsCUDA(simParams->outputEnergies,
  gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
  consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
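
// updateDeviceKernels() below rebuilds the feature kernels whose atom
// selections can change at runtime (grid forces, constant forces).  The
// pattern is "delete, reconstruct, re-map atoms"; a minimal sketch for one
// kernel (argument lists abbreviated relative to the real call sites):
//
//   delete gridForceKernel;
//   gridForceKernel = new ComputeGridForceCUDA(patches, atomMapList, stream);
//   gridForceKernel->updateGriddedAtoms(atomMapList, localPatches, patches,
//                                       globalToLocalID, mGpuOn);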
void SequencerCUDA::updateDeviceKernels()
{
  if (patchData->updateCounter.fetch_sub(1) >= 1) {
    if (gridForceKernel != NULL) {
      delete gridForceKernel;
      gridForceKernel = new ComputeGridForceCUDA(patchData->devData[deviceIndex].patches, atomMapList, stream);
      gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);
    }
    if (consForceKernel != NULL) {
      delete consForceKernel;
      consForceKernel = new ComputeConsForceCUDA(patchList, atomMapList, mGpuOn);
      consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
    }
  }
}
bool SequencerCUDA::reallocateArrays(int in_numAtomsHome, int in_numAtomsHomeAndProxy) {
  const float OVERALLOC = 1.5f;

  if (in_numAtomsHomeAndProxy <= numAtomsHomeAndProxyAllocated && in_numAtomsHome <= numAtomsHomeAllocated) {
    return false;
  }

  bool realloc_gpu_saved_force = false;
  if (d_f_saved_nbond_x != nullptr || d_f_saved_slow_x != nullptr) {
    realloc_gpu_saved_force = true;
  }

  numAtomsHomeAndProxyAllocated = (int) ((float) in_numAtomsHomeAndProxy * OVERALLOC);
  numAtomsHomeAllocated = (int) ((float) in_numAtomsHome * OVERALLOC);
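
  // Capacities grow by OVERALLOC (1.5x) past the requested counts so that
  // small fluctuations in atom counts after migration do not force a
  // reallocation every step; reallocation only happens when a requested
  // count exceeds the stored capacity (see the early-return test above).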
  allocate_host<double>(&f_normal_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_normal_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_nbond_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_slow_z, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&pos_z, numAtomsHomeAndProxyAllocated);

  allocate_host<double>(&f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_host<double>(&f_global_z, numAtomsHomeAndProxyAllocated);

  allocate_host<float>(&charge, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&sortOrder, numAtomsHomeAndProxyAllocated);
  allocate_host<int>(&unsortOrder, numAtomsHomeAndProxyAllocated);

  allocate_host<double>(&recipMass, numAtomsHomeAllocated);
  allocate_host<double>(&vel_x, numAtomsHomeAllocated);
  allocate_host<double>(&vel_y, numAtomsHomeAllocated);
  allocate_host<double>(&vel_z, numAtomsHomeAllocated);
  allocate_host<char3>(&transform, numAtomsHomeAllocated);
  allocate_host<float>(&mass, numAtomsHomeAllocated);

  allocate_host<int>(&partition, numAtomsHomeAndProxyAllocated);

  allocate_host<float>(&langevinParam, numAtomsHomeAllocated);
  allocate_host<float>(&langScalVelBBK2, numAtomsHomeAllocated);
  allocate_host<float>(&langScalRandBBK2, numAtomsHomeAllocated);

  int n = (numAtomsHomeAllocated + 1) & (~1);
  allocate_host<int>(&hydrogenGroupSize, numAtomsHomeAllocated);
  allocate_host<int>(&atomFixed, numAtomsHomeAllocated);
  allocate_host<int>(&groupFixed, numAtomsHomeAllocated);

  allocate_host<float>(&rigidBondLength, numAtomsHomeAllocated);

  allocate_host<int>(&idMig, numAtomsHomeAllocated);
  allocate_host<int>(&vdwType, numAtomsHomeAllocated);
  allocate_device<double>(&d_f_raw, 9 * numAtomsHomeAndProxyAllocated);
  d_f_normal_x = &d_f_raw[numAtomsHomeAndProxyAllocated*0];
  d_f_normal_y = &d_f_raw[numAtomsHomeAndProxyAllocated*1];
  d_f_normal_z = &d_f_raw[numAtomsHomeAndProxyAllocated*2];
  d_f_nbond_x = &d_f_raw[numAtomsHomeAndProxyAllocated*3];
  d_f_nbond_y = &d_f_raw[numAtomsHomeAndProxyAllocated*4];
  d_f_nbond_z = &d_f_raw[numAtomsHomeAndProxyAllocated*5];
  d_f_slow_x = &d_f_raw[numAtomsHomeAndProxyAllocated*6];
  d_f_slow_y = &d_f_raw[numAtomsHomeAndProxyAllocated*7];
  d_f_slow_z = &d_f_raw[numAtomsHomeAndProxyAllocated*8];
  allocate_device<double>(&d_pos_raw, 3 * numAtomsHomeAndProxyAllocated);
  d_pos_x = &d_pos_raw[numAtomsHomeAndProxyAllocated*0];
  d_pos_y = &d_pos_raw[numAtomsHomeAndProxyAllocated*1];
  d_pos_z = &d_pos_raw[numAtomsHomeAndProxyAllocated*2];
  allocate_device<float>(&d_charge, numAtomsHomeAndProxyAllocated);

  allocate_device<double>(&d_f_global_x, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_y, numAtomsHomeAndProxyAllocated);
  allocate_device<double>(&d_f_global_z, numAtomsHomeAndProxyAllocated);

  if (realloc_gpu_saved_force) {
    allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
    allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
  }
  allocate_device<int>(&d_sortOrder, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_unsortOrder, numAtomsHomeAndProxyAllocated);

  allocate_device<double>(&d_f_rawMC, numAtomsHomeAndProxyAllocated*9);
  allocate_device<double>(&d_pos_rawMC, numAtomsHomeAndProxyAllocated*3);
  d_f_normalMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_f_normalMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_f_normalMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*2];
  d_f_nbondMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*3];
  d_f_nbondMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*4];
  d_f_nbondMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*5];
  d_f_slowMC_x = &d_f_rawMC[numAtomsHomeAndProxyAllocated*6];
  d_f_slowMC_y = &d_f_rawMC[numAtomsHomeAndProxyAllocated*7];
  d_f_slowMC_z = &d_f_rawMC[numAtomsHomeAndProxyAllocated*8];
  d_posMC_x = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*0];
  d_posMC_y = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*1];
  d_posMC_z = &d_pos_rawMC[numAtomsHomeAndProxyAllocated*2];

  allocate_host<int>(&id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_id, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_idOrder, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_moleculeAtom, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_moleculeStartIndex, numAtomsHomeAndProxyAllocated);

  allocate_device<int>(&d_partition, numAtomsHomeAndProxyAllocated);
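
  // Force and position buffers are carved out of pooled allocations
  // (d_f_raw: 9 contiguous segments, d_pos_raw: 3 segments), so a single
  // allocate_device call backs all per-component pointers.  Layout sketch:
  //
  //   d_f_raw: [normal_x | normal_y | normal_z | nbond_x | ... | slow_z]
  //   each segment is numAtomsHomeAndProxyAllocated doubles long.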
  allocate_device<double>(&d_posNew_raw, 3 * numAtomsHomeAllocated);
  d_posNew_x = &d_posNew_raw[numAtomsHomeAllocated*0];
  d_posNew_y = &d_posNew_raw[numAtomsHomeAllocated*1];
  d_posNew_z = &d_posNew_raw[numAtomsHomeAllocated*2];
  allocate_device<double>(&d_vel_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_vel_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_vel_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_recipMass, numAtomsHomeAllocated);
  allocate_device<char3>(&d_transform, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_velNew_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_posSave_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_rcm_z, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_vcm_z, numAtomsHomeAllocated);

  allocate_device<float>(&d_mass, numAtomsHomeAllocated);
  allocate_device<float>(&d_langevinParam, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalVelBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_langScalRandBBK2, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_x, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_y, numAtomsHomeAllocated);
  allocate_device<float>(&d_gaussrand_z, numAtomsHomeAllocated);
  allocate_device<int>(&d_hydrogenGroupSize, numAtomsHomeAllocated);
  allocate_device<float>(&d_rigidBondLength, numAtomsHomeAllocated);
  allocate_device<int>(&d_atomFixed, numAtomsHomeAllocated);
  allocate_device<int>(&d_groupFixed, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_x, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_y, numAtomsHomeAllocated);
  allocate_device<double>(&d_fixedPosition_z, numAtomsHomeAllocated);

  allocate_device<int>(&d_idMig, numAtomsHomeAndProxyAllocated);
  allocate_device<int>(&d_vdwType, numAtomsHomeAndProxyAllocated);
  allocate_device<FullAtom>(&d_atomdata_AoS, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupSize, numAtomsHomeAllocated);
  allocate_device<int>(&d_migrationGroupIndex, numAtomsHomeAllocated);
  allocate_device<int>(&d_sortIndex, numAtomsHomeAllocated);
  d_f_saved_nbond_x = nullptr;
  d_f_saved_nbond_y = nullptr;
  d_f_saved_nbond_z = nullptr;
  d_f_saved_slow_x = nullptr;
  d_f_saved_slow_y = nullptr;
  d_f_saved_slow_z = nullptr;

  memset(pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  memset(pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated);
  cudaCheck(cudaMemset(d_pos_x, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_pos_y, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_pos_z, 0, sizeof(double)*numAtomsHomeAndProxyAllocated));
  cudaCheck(cudaMemset(d_vel_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_vel_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_vel_z, 0, sizeof(double)*numAtomsHomeAllocated));

  cudaCheck(cudaMemset(d_posNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_posNew_z, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_x, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_y, 0, sizeof(double)*numAtomsHomeAllocated));
  cudaCheck(cudaMemset(d_velNew_z, 0, sizeof(double)*numAtomsHomeAllocated));

  return true;
}
void SequencerCUDA::reallocateMigrationDestination() {
  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);
  allocate_device<int4>(&d_migrationDestination, numAtomsHomeAndProxyAllocated);
}
void SequencerCUDA::deallocateArrays() {
  if (numAtomsHomeAndProxyAllocated != 0) {
    deallocate_host<double>(&f_normal_x);
    deallocate_host<double>(&f_normal_y);
    deallocate_host<double>(&f_normal_z);

    deallocate_host<double>(&f_global_x);
    deallocate_host<double>(&f_global_y);
    deallocate_host<double>(&f_global_z);

    deallocate_host<double>(&f_nbond_x);
    deallocate_host<double>(&f_nbond_y);
    deallocate_host<double>(&f_nbond_z);
    deallocate_host<double>(&f_slow_x);
    deallocate_host<double>(&f_slow_y);
    deallocate_host<double>(&f_slow_z);
    deallocate_host<double>(&pos_x);
    deallocate_host<double>(&pos_y);
    deallocate_host<double>(&pos_z);
    deallocate_host<float>(&charge);
    deallocate_host<int>(&sortOrder);
    deallocate_host<int>(&unsortOrder);
    deallocate_host<double>(&recipMass);
    deallocate_host<double>(&vel_x);
    deallocate_host<double>(&vel_y);
    deallocate_host<double>(&vel_z);
    deallocate_host<char3>(&transform);
    deallocate_host<float>(&mass);

    deallocate_host<float>(&langevinParam);
    deallocate_host<float>(&langScalVelBBK2);
    deallocate_host<float>(&langScalRandBBK2);

    deallocate_host<int>(&hydrogenGroupSize);
    deallocate_host<int>(&atomFixed);
    deallocate_host<int>(&groupFixed);
    deallocate_host<double>(&fixedPosition_x);
    deallocate_host<double>(&fixedPosition_y);
    deallocate_host<double>(&fixedPosition_z);

    deallocate_host<float>(&rigidBondLength);

    deallocate_device<double>(&d_f_raw);
    deallocate_device<double>(&d_pos_raw);
    deallocate_device<float>(&d_charge);
    deallocate_device<int>(&d_sortOrder);
    deallocate_device<int>(&d_unsortOrder);
    deallocate_device<int>(&d_partition);

    deallocate_device<double>(&d_posNew_raw);

    deallocate_device<double>(&d_f_rawMC);
    deallocate_device<double>(&d_pos_rawMC);

    deallocate_host<int>(&id);
    deallocate_device<int>(&d_id);
    deallocate_device<int>(&d_idOrder);
    deallocate_device<int>(&d_moleculeAtom);
    deallocate_device<int>(&d_moleculeStartIndex);

    deallocate_host<int>(&idMig);
    deallocate_host<int>(&vdwType);
    deallocate_device<int>(&d_idMig);
    deallocate_device<int>(&d_vdwType);
    deallocate_device<FullAtom>(&d_atomdata_AoS);
    deallocate_device<int>(&d_migrationGroupSize);
    deallocate_device<int>(&d_migrationGroupIndex);
    deallocate_device<int>(&d_sortIndex);

    deallocate_device<double>(&d_f_global_x);
    deallocate_device<double>(&d_f_global_y);
    deallocate_device<double>(&d_f_global_z);

    deallocate_device<double>(&d_vel_x);
    deallocate_device<double>(&d_vel_y);
    deallocate_device<double>(&d_vel_z);
    deallocate_device<double>(&d_recipMass);
    deallocate_device<char3>(&d_transform);
    deallocate_device<double>(&d_velNew_x);
    deallocate_device<double>(&d_velNew_y);
    deallocate_device<double>(&d_velNew_z);
    deallocate_device<double>(&d_posSave_x);
    deallocate_device<double>(&d_posSave_y);
    deallocate_device<double>(&d_posSave_z);
    deallocate_device<double>(&d_rcm_x);
    deallocate_device<double>(&d_rcm_y);
    deallocate_device<double>(&d_rcm_z);
    deallocate_device<double>(&d_vcm_x);
    deallocate_device<double>(&d_vcm_y);
    deallocate_device<double>(&d_vcm_z);
    deallocate_device<float>(&d_mass);
    deallocate_device<float>(&d_langevinParam);
    deallocate_device<float>(&d_langScalVelBBK2);
    deallocate_device<float>(&d_langScalRandBBK2);
    deallocate_device<float>(&d_gaussrand_x);
    deallocate_device<float>(&d_gaussrand_y);
    deallocate_device<float>(&d_gaussrand_z);
    deallocate_device<int>(&d_hydrogenGroupSize);
    deallocate_device<float>(&d_rigidBondLength);
    deallocate_device<int>(&d_atomFixed);
    deallocate_device<int>(&d_groupFixed);
    deallocate_device<double>(&d_fixedPosition_x);
    deallocate_device<double>(&d_fixedPosition_y);
    deallocate_device<double>(&d_fixedPosition_z);

    deallocate_device<double>(&d_f_saved_nbond_x);
    deallocate_device<double>(&d_f_saved_nbond_y);
    deallocate_device<double>(&d_f_saved_nbond_z);
    deallocate_device<double>(&d_f_saved_slow_x);
    deallocate_device<double>(&d_f_saved_slow_y);
    deallocate_device<double>(&d_f_saved_slow_z);
  }
}
void SequencerCUDA::deallocateStaticArrays() {
  deallocate_host<cudaTensor>(&extVirial);
  deallocate_host<double3>(&extForce);
  deallocate_host<double>(&extEnergy);
  deallocate_host<unsigned int>(&h_marginViolations);
  deallocate_host<unsigned int>(&h_periodicCellSmall);

  deallocate_host<double3>(&awayDists);
  deallocate_host<double3>(&patchMin);
  deallocate_host<double3>(&patchMax);
  deallocate_host<CudaAtom*>(&cudaAtomLists);
  deallocate_host<double3>(&patchCenter);
  deallocate_host<int>(&globalToLocalID);
  deallocate_host<int>(&patchToDeviceMap);
  deallocate_host<Lattice>(&pairlist_lattices);
  deallocate_host<double>(&patchMaxAtomMovement);
  deallocate_host<double>(&patchNewTolerance);
  deallocate_host<CudaMInfo>(&mInfo);
  deallocate_host<bool*>(&h_patchRecordHasForces);

  deallocate_host<cudaTensor>(&lpVirialNormal);
  deallocate_host<cudaTensor>(&lpVirialNbond);
  deallocate_host<cudaTensor>(&lpVirialSlow);

  deallocate_device<double3>(&d_awayDists);
  deallocate_device<double3>(&d_patchMin);
  deallocate_device<double3>(&d_patchMax);
  deallocate_device<int>(&d_globalToLocalID);
  deallocate_device<int>(&d_patchToDeviceMap);
  deallocate_device<int>(&d_sortOrder);
  deallocate_device<int>(&d_unsortOrder);
  deallocate_device<double3>(&d_patchCenter);
  deallocate_device<Lattice>(&d_lattices);
  deallocate_device<Lattice>(&d_pairlist_lattices);
  deallocate_device<double>(&d_patchMaxAtomMovement);
  deallocate_device<double>(&d_patchNewTolerance);
  deallocate_device<CudaMInfo>(&d_mInfo);

  deallocate_device<int>(&d_killme);
  deallocate_device<char>(&d_barrierFlag);
  deallocate_device<unsigned int>(&d_tbcatomic);
  deallocate_device<BigReal>(&d_kineticEnergy);
  deallocate_device<BigReal>(&d_intKineticEnergy);
  deallocate_device<BigReal>(&d_momentum_x);
  deallocate_device<BigReal>(&d_momentum_y);
  deallocate_device<BigReal>(&d_momentum_z);
  deallocate_device<BigReal>(&d_angularMomentum_x);
  deallocate_device<BigReal>(&d_angularMomentum_y);
  deallocate_device<BigReal>(&d_angularMomentum_z);
  deallocate_device<cudaTensor>(&d_virial);
  deallocate_device<cudaTensor>(&d_intVirialNormal);
  deallocate_device<cudaTensor>(&d_intVirialNbond);
  deallocate_device<cudaTensor>(&d_intVirialSlow);
  deallocate_device<cudaTensor>(&d_lpVirialNormal);
  deallocate_device<cudaTensor>(&d_lpVirialNbond);
  deallocate_device<cudaTensor>(&d_lpVirialSlow);
  deallocate_device<cudaTensor>(&d_rigidVirial);
  deallocate_device<cudaTensor>(&d_extVirial);
  deallocate_device<double3>(&d_extForce);
  deallocate_device<double>(&d_extEnergy);
  deallocate_device<SettleParameters>(&sp);
  deallocate_device<unsigned int>(&deviceQueue);
  deallocate_device<double*>(&d_peer_pos_x);
  deallocate_device<double*>(&d_peer_pos_y);
  deallocate_device<double*>(&d_peer_pos_z);
  deallocate_device<float*>(&d_peer_charge);
  deallocate_device<int*>(&d_peer_partition);
  deallocate_device<double*>(&d_peer_vel_x);
  deallocate_device<double*>(&d_peer_vel_y);
  deallocate_device<double*>(&d_peer_vel_z);
  deallocate_device<double*>(&d_peer_fb_x);
  deallocate_device<double*>(&d_peer_fb_y);
  deallocate_device<double*>(&d_peer_fb_z);
  deallocate_device<double*>(&d_peer_fn_x);
  deallocate_device<double*>(&d_peer_fn_y);
  deallocate_device<double*>(&d_peer_fn_z);
  deallocate_device<double*>(&d_peer_fs_x);
  deallocate_device<double*>(&d_peer_fs_y);
  deallocate_device<double*>(&d_peer_fs_z);

  deallocate_device<bool*>(&d_patchRecordHasForces);

  deallocate_device<FullAtom>(&d_atomdata_AoS_in);
  deallocate_device<int>(&d_sortSoluteIndex);
  if (d_migrationDestination != NULL) deallocate_device<int4>(&d_migrationDestination);

  deallocate_device<PatchDataSOA>(&d_HostPatchDataSOA);
}
void SequencerCUDA::copyMigrationInfo(HomePatch *p, int patchIndex) {
  if (!p->patchMapRead) p->readPatchMap();
  for (int x = 0; x < 3; x++) {
    for (int y = 0; y < 3; y++) {
      for (int z = 0; z < 3; z++) {
void SequencerCUDA::assembleOrderedPatchList() {
  for (int i = 0; i < patchData->devData[deviceIndex].patches.size(); i++) {
    HomePatch *p = patchData->devData[deviceIndex].patches[i];
    patchList.push_back(p);
  }

  patchListHomeAndProxy.clear();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    const int patchID = localPatches[i].patchID;
    for (int d = 0; d < CkNumPes(); d++) {
      patchListHomeAndProxy.push_back(patch);
void SequencerCUDA::copyAoSDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;

  CUDAMigrationKernel->update_AoS(
    patchData->devData[deviceIndex].d_localPatches,
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    copy_DtoH<FullAtom>(d_atomdata_AoS + offset, (FullAtom*)h_atomdata.begin(), numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
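
// GPU atom migration is driven by the methods below.  Based on the call names
// in this file, the per-migration-step ordering is roughly:
//
//   migrationLocalInit();            // group indices, AoS update, destinations
//   migrationPerform();              // move atoms between patches
//   migrationUpdateAtomCounts();     // refresh per-patch atom counts
//   migrationUpdateAtomOffsets();    // recompute buffer offsets
//   migrationUpdateRemoteOffsets();  // publish offsets to peer devices
//   migrationUpdateProxyDestination();
//
// This is an illustrative summary, not the authoritative scheduling, which is
// coordinated by sequencer driver code outside this excerpt.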
void SequencerCUDA::migrationLocalInit() {
  CUDAMigrationKernel->computeMigrationGroupIndex(
    patchData->devData[deviceIndex].d_localPatches,
    d_migrationGroupSize,
    d_migrationGroupIndex,

  CUDAMigrationKernel->update_AoS(
    patchData->devData[deviceIndex].d_localPatches,
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,

  CUDAMigrationKernel->computeMigrationDestination(
    patchData->devData[deviceIndex].d_localPatches,
    d_hydrogenGroupSize,
    d_migrationGroupSize,
    d_migrationGroupIndex,
    d_pos_x, d_pos_y, d_pos_z,
    d_migrationDestination,

  CUDAMigrationKernel->performLocalMigration(
    patchData->devData[deviceIndex].d_localPatches,
    d_migrationDestination,

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationPerform() {
  CUDAMigrationKernel->performMigration(
    patchData->devData[deviceIndex].d_localPatches,
    d_migrationGroupSize,
    d_migrationGroupIndex,
    d_migrationDestination,

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAtomCounts() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  CUDAMigrationKernel->updateLocalRecords(
    patchData->devData[deviceIndex].d_localPatches,
    patchData->devData[deviceIndex].d_peerPatches,

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateAtomOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  CUDAMigrationKernel->updateLocalRecordsOffset(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateRemoteOffsets() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  CUDAMigrationKernel->updatePeerRecords(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    patchData->devData[deviceIndex].d_peerPatches,

  cudaCheck(cudaStreamSynchronize(stream));
}

void SequencerCUDA::migrationUpdateProxyDestination() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  CUDAMigrationKernel->copyMigrationDestinationToProxies(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,
    patchData->devData[deviceIndex].d_peerPatches,
    d_peer_migrationDestination,
void SequencerCUDA::copyPatchDataToHost() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;

  copy_DtoH<CudaLocalRecord>(patchData->devData[deviceIndex].d_localPatches, localPatches.data(), numPatchesHomeAndProxy, stream);
  cudaCheck(cudaStreamSynchronize(stream));

  for (int i = 0; i < numPatchesHome; i++) {

  cudaCheck(cudaStreamSynchronize(stream));
void SequencerCUDA::copyAtomDataToDeviceAoS() {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  const int numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  std::vector<HomePatch*>& integrationPatches = patchData->devData[deviceIndex].patches;

  for (int i = 0; i < integrationPatches.size(); i++) {
    const int numAtoms = localPatches[i].numAtoms;
    if (numAtoms > MigrationCUDAKernel::kMaxAtomsPerPatch) {
      iout << iERROR << "The number of atoms in patch " << i << " is "
           << numAtoms << ", greater than the limit for GPU atom migration ("
           << MigrationCUDAKernel::kMaxAtomsPerPatch << ").\n" << endi;
      NAMD_bug("NAMD has stopped simulating due to the error above, "
               "but you could disable GPUAtomMigration and try again.\n");
    }
    const int offset = localPatches[i].bufferOffset;
    HomePatch *patch = integrationPatches[i];
    copy_HtoD<FullAtom>((FullAtom*)h_atomdata.begin(), d_atomdata_AoS_in + ((int64_t) i) * MigrationCUDAKernel::kMaxAtomsPerPatch, numAtoms, stream);
  }
  cudaCheck(cudaStreamSynchronize(stream));
void SequencerCUDA::copyAtomDataToDevice(bool copyForces, int maxForceNumber) {
  AGGREGATE_HOME_ATOMS_TO_DEVICE(recipMass, double, stream);

  switch (maxForceNumber) {
    case 2:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_slow_z, double, stream);
    case 1:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_nbond_z, double, stream);
    case 0:
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_x, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_y, double, stream);
      AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(f_normal_z, double, stream);
  }

  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(vel_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(pos_z, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(mass, float, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(charge, float, stream);

  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(partition, int, stream);

  AGGREGATE_HOME_ATOMS_TO_DEVICE(langevinParam, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalVelBBK2, float, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(langScalRandBBK2, float, stream);

  AGGREGATE_HOME_ATOMS_TO_DEVICE(hydrogenGroupSize, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(atomFixed, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(groupFixed, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_x, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_y, double, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(fixedPosition_z, double, stream);

  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(sortOrder, int, stream);
  AGGREGATE_HOME_AND_PROXY_ATOMS_TO_DEVICE(unsortOrder, int, stream);
  AGGREGATE_HOME_ATOMS_TO_DEVICE(rigidBondLength, float, stream);

  AGGREGATE_HOME_ATOMS_TO_DEVICE(id, int, stream);

  CUDASequencerKernel->SetAtomIndexOrder(d_id, d_idOrder, numAtomsHome, stream);
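
// Each AGGREGATE_* invocation above expands to a host-side gather over the
// patches followed by a single copy_HtoD into the matching d_<field> buffer,
// so copyAtomDataToDevice() issues one transfer per field rather than one per
// patch.  The switch on maxForceNumber falls through, so a higher force level
// (slow, 2) also copies the lower-level (nbond, normal) force buffers.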
void SequencerCUDA::migrationLocalPost(int startup) {
  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();

  CUDAMigrationKernel->transformMigratedPositions(
    patchData->devData[deviceIndex].d_localPatches,

  CUDAMigrationKernel->sortSolventAtoms(
    patchData->devData[deviceIndex].d_localPatches,

  double tempFactor = 1.0;
  tempFactor = (lesReduceTemp ? 1. / simParams->lesFactor : 1);

  CUDAMigrationKernel->copy_AoS_to_SoA(
    simParams->langevinOn, dt, kbT, tempFactor,
    patchData->devData[deviceIndex].d_localPatches,
    d_vel_x, d_vel_y, d_vel_z,
    d_pos_x, d_pos_y, d_pos_z,
    d_hydrogenGroupSize, d_migrationGroupSize,

  CUDASequencerKernel->SetAtomIndexOrder(d_idMig, d_idOrder, numAtomsHome, stream);

  copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
  copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
  copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);

  myLatticeOld = myLattice;

  CUDASequencerKernel->centerOfMass(
    d_pos_x, d_pos_y, d_pos_z,
    d_rcm_x, d_rcm_y, d_rcm_z,
    d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
  CUDASequencerKernel->centerOfMass(
    d_vel_x, d_vel_y, d_vel_z,
    d_vcm_x, d_vcm_y, d_vcm_z,
    d_mass, d_hydrogenGroupSize, numAtomsHome, stream);

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationUpdateAdvancedFeatures(const int startup) {
  for (int i = 0; i < numPatchesHome; i++) {
    const int numPatchAtoms = current.numAtoms;
    for (int j = 0; j < numPatchAtoms; j++) {

    offset += numPatchAtoms;
  }
  copy_HtoD<char3>(transform, d_transform, numAtomsHome, stream);

  lonepairsKernel->updateAtoms(patchList, atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID, stream);

  restraintsKernel->updateRestrainedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);

  SMDKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);

  groupRestraintsKernel->updateAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);

  gridForceKernel->updateGriddedAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, patchData->devData[deviceIndex].patches, globalToLocalID, mGpuOn);

  consForceKernel->updateConsForceAtoms(atomMapList, patchData->devData[deviceIndex].h_localPatches, globalToLocalID);
void SequencerCUDA::migrationUpdateDestination() {
  CUDAMigrationKernel->updateMigrationDestination(
    d_migrationDestination,
    d_peer_sortSoluteIndex,
bool SequencerCUDA::copyPatchData(
  bool reallocated = false;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
  std::vector<CudaPeerRecord>& peerPatches = patchData->devData[deviceIndex].h_peerPatches;
  std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;

  numPatchesHomeAndProxy = patchData->devData[deviceIndex].numPatchesHomeAndProxy;
  numPatchesHome = homePatches.size();
  patchData->devData[deviceIndex].numPatchesHome = numPatchesHome;

  allocate_device<FullAtom>(&d_atomdata_AoS_in, ((int64_t) numPatchesHome) * MigrationCUDAKernel::kMaxAtomsPerPatch);
  allocate_device<int>(&d_sortSoluteIndex, numPatchesHome * MigrationCUDAKernel::kMaxAtomsPerPatch);
  d_migrationDestination = NULL;

#if defined(NAMD_HIP)
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_localPatches,
      hipDeviceMallocFinegrained);
  hipExtMallocWithFlags((void**)&patchData->devData[deviceIndex].d_peerPatches,
      hipDeviceMallocFinegrained);
#else
  allocate_device<CudaLocalRecord>(&patchData->devData[deviceIndex].d_localPatches, numPatchesHomeAndProxy);
  allocate_device<CudaPeerRecord>(&patchData->devData[deviceIndex].d_peerPatches, peerPatches.size());
#endif

  CUDAMigrationKernel->allocateScratch(numPatchesHomeAndProxy);

  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
      numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
      peerPatches.size(), stream);
  if (true || mGpuOn) {
    this->assembleOrderedPatchList();
  }
  this->copySettleParameter();
  for (int i = 0; i < numPatchesHome; i++) {
    HomePatch *patch = homePatches[i];
    this->copyMigrationInfo(patch, i);

    patchToDeviceMap[patch->getPatchID()] = deviceIndex;
  }
  copy_HtoD<double>(patchNewTolerance, d_patchNewTolerance, numPatchesHome, stream);
  copy_HtoD<CudaMInfo>(mInfo, d_mInfo, numPatchesHome, stream);

    if (i == deviceIndex) continue;
    std::vector<HomePatch*>& otherPatches = patchData->devData[i].patches;
    for (int j = 0; j < otherPatches.size(); j++) {

  copy_HtoD<int>(globalToLocalID, d_globalToLocalID, numPatchesGlobal, stream);
  copy_HtoD<int>(patchToDeviceMap, d_patchToDeviceMap, numPatchesGlobal, stream);
  patchData->devData[deviceIndex].d_globalToLocalID = d_globalToLocalID;
  patchData->devData[deviceIndex].d_patchToDeviceMap = d_patchToDeviceMap;

  allocate_device<PatchDataSOA>(&d_HostPatchDataSOA, numPatchesHome);

  for (int i = 0; i < numPatchesHomeAndProxy; i++) {
    HomePatch *patch = patchListHomeAndProxy[i];
    awayDists[i].x = patch->aAwayDist;
    awayDists[i].y = patch->bAwayDist;
    awayDists[i].z = patch->cAwayDist;

  copy_HtoD<double3>(awayDists, d_awayDists, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMin, d_patchMin, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchMax, d_patchMax, numPatchesHomeAndProxy, stream);
  copy_HtoD<double3>(patchCenter, d_patchCenter, numPatchesHomeAndProxy, stream);

  const int totalAtomCount = localPatches[numPatchesHomeAndProxy-1].bufferOffset +
                             localPatches[numPatchesHomeAndProxy-1].numAtoms;
  const int homeAtomCount = localPatches[numPatchesHome-1].bufferOffset +
                            localPatches[numPatchesHome-1].numAtoms;

  reallocated = reallocateArrays(homeAtomCount, totalAtomCount);

  numAtomsHomePrev = numAtomsHome;
  numAtomsHomeAndProxy = totalAtomCount;
  numAtomsHome = homeAtomCount;
  patchData->devData[deviceIndex].numAtomsHome = numAtomsHome;

  copy_HtoD<CudaLocalRecord>(localPatches.data(), patchData->devData[deviceIndex].d_localPatches,
      numPatchesHomeAndProxy, stream);
  copy_HtoD<CudaPeerRecord>(peerPatches.data(), patchData->devData[deviceIndex].d_peerPatches,
      peerPatches.size(), stream);

  copy_HtoD<int>(molecule->moleculeAtom, d_moleculeAtom, numAtomsHome, stream);
void SequencerCUDA::copyDataToPeers(const bool copyIn) {
  if (!copyIn) return;

  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
  patchData = cpdata.ckLocalBranch();
  CUDAMigrationKernel->copyDataToProxies(
    numPatchesHomeAndProxy,
    patchData->devData[deviceIndex].d_localPatches,

  cudaCheck(cudaStreamSynchronize(stream));
}
void SequencerCUDA::migrationSortAtomsNonbonded() {
  CUDAMigrationKernel->sortAtoms(
    numPatchesHome, numAtomsHome,
    patchData->devData[deviceIndex].d_localPatches,
    d_pos_x, d_pos_y, d_pos_z,

void SequencerCUDA::maximumMove(
    const double maxvel2,
  CUDASequencerKernel->maximumMove(
    maxvel2, d_vel_x, d_vel_y, d_vel_z,
    killme, numAtoms, stream);
}
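
// submitHalf() and submitReductions() below optionally time their kernels with
// a cudaEventRecord / cudaEventSynchronize / cudaEventElapsedTime triple.  The
// same measurement pattern in isolation (eventStart/eventStop are the members
// created in initialize()):
//
//   cudaCheck(cudaEventRecord(eventStart, stream));
//   // ... enqueue kernel(s) on `stream` ...
//   cudaCheck(cudaEventRecord(eventStop, stream));
//   cudaCheck(cudaEventSynchronize(eventStop));
//   float ms; cudaCheck(cudaEventElapsedTime(&ms, eventStart, eventStop));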
void SequencerCUDA::submitHalf(
    int numAtoms, int part, const bool doEnergy)
{
  Tensor reduction_intVirialNormal;

  cudaCheck(cudaEventRecord(eventStart, stream));

  CUDASequencerKernel->submitHalf(
    d_vel_x, d_vel_y, d_vel_z,
    d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
    d_kineticEnergy, d_intKineticEnergy,
    d_virial, d_intVirialNormal, kineticEnergy_half, intKineticEnergy_half,
    virial_half, intVirialNormal_half,
    d_hydrogenGroupSize, numAtoms, d_tbcatomic, stream);

  cudaCheck(cudaEventRecord(eventStop, stream));
  cudaCheck(cudaEventSynchronize(eventStop));
  cudaCheck(cudaEventElapsedTime(&t_submitHalf, eventStart, eventStop));
  fprintf(stderr, "submitHalf total elapsed time: %f\n", t_submitHalf);
  t_submitReductions2 = 0;
1905 void SequencerCUDA::submitReductions(
1910 int marginViolations,
1913 int numAtomsReduction,
1922 CUDASequencerKernel->submitReduction1(
1923 d_pos_x, d_pos_y, d_pos_z,
1924 d_vel_x, d_vel_y, d_vel_z, d_mass,
1926 d_momentum_x, d_momentum_y, d_momentum_z,
1927 d_angularMomentum_x, d_angularMomentum_y, d_angularMomentum_z,
1928 origin_x, origin_y, origin_z, kineticEnergy, momentum_x, momentum_y,
1929 momentum_z, angularMomentum_x, angularMomentum_y, angularMomentum_z, d_tbcatomic,
1930 numAtomsReduction, stream);
1932 Tensor regintVirialNormal;
1933 Tensor regintVirialNbond;
1937 cudaCheck(cudaEventRecord(eventStart,stream));
1939 CUDASequencerKernel->submitReduction2(
1941 d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
1942 d_rcm_x, d_rcm_y, d_rcm_z, d_vcm_x, d_vcm_y, d_vcm_z,
1943 d_f_normal_x, d_f_normal_y, d_f_normal_z,
1944 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1945 d_f_slow_x, d_f_slow_y, d_f_slow_z,
1946 d_mass, d_hydrogenGroupSize,
1947 d_kineticEnergy, kineticEnergy,
1948 d_intKineticEnergy, intKineticEnergy,
1949 d_intVirialNormal, d_intVirialNbond, d_intVirialSlow,
1950 intVirialNormal, intVirialNbond, intVirialSlow, d_rigidVirial, rigidVirial,
1951 d_tbcatomic, numAtomsReduction, maxForceNumber,
simParams->isMultiTimeStepping(), stream);
1954 CUDASequencerKernel->calcFixVirial(
1955 maxForceNumber, numAtomsReduction, d_atomFixed,
1956 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
1957 d_f_normal_x, d_f_normal_y, d_f_normal_z,
1958 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
1959 d_f_slow_x, d_f_slow_y, d_f_slow_z,
1960 d_fixVirialNormal, d_fixVirialNbond, d_fixVirialSlow,
1961 d_fixForceNormal, d_fixForceNbond, d_fixForceSlow, stream);
1965 cudaCheck(cudaEventRecord(eventStop, stream));
1966 cudaCheck(cudaEventSynchronize(eventStop));
1967 cudaCheck(cudaEventElapsedTime(&t_submitReductions2, eventStart, eventStop));
1968 fprintf(stderr, "submitReductions2 total elapsed time: %f\n", t_submitReductions2);
1969 t_submitReductions2 = 0;
1973 void SequencerCUDA::copySettleParameter(){
1980 for(int i = 0; i < patchList.size(); i++){
1981 if(patchList[i]->settle_initialized) {
1982 patch = patchList[i];
1988 h_sp.mO = patch->settle_mO;
1989 h_sp.mH = patch->settle_mH;
1990 h_sp.mOrmT = patch->settle_mOrmT;
1991 h_sp.mHrmT = patch->settle_mHrmT;
1992 h_sp.rra = patch->settle_rra;
1993 h_sp.ra = patch->settle_ra;
1994 h_sp.rb = patch->settle_rb;
1995 h_sp.rc = patch->settle_rc;
1996 h_sp.r_om = patch->r_om;
1997 h_sp.r_ohc = patch->r_ohc;
2001 copy_HtoD<SettleParameters>(&h_sp, this->sp, 1, stream);
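// SETTLE parameters (masses, canonical O-H geometry, and derived ratios) are taken
// from the first patch that has settle_initialized set and uploaded once as a single
// SettleParameters struct, which presumes all rigid waters share the same geometry.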
2007 void SequencerCUDA::startRun1(
2012 myLattice = lattice;
2015 CUDASequencerKernel->rattle1(
2017 numAtomsHome, 0.f, 0.f,
2019 d_vel_x, d_vel_y, d_vel_z,
2020 d_pos_x, d_pos_y, d_pos_z,
2021 d_velNew_x, d_velNew_y, d_velNew_z,
2022 d_posNew_x, d_posNew_y, d_posNew_z,
2023 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2024 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2025 &settleList, settleListSize, &d_consFailure,
2026 d_consFailureSize, &rattleList, rattleListSize,
2028 d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2029 buildRigidLists, consFailure, simParams->watmodel, stream);
2031 this->copyPositionsAndVelocitiesToHost(1, 0);
2034 printSOAPositionsAndVelocities();
2038 void SequencerCUDA::startRun2(
2060 cudaCheck(cudaMemset(d_f_nbond_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2061 cudaCheck(cudaMemset(d_f_nbond_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2062 cudaCheck(cudaMemset(d_f_nbond_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2064 cudaCheck(cudaMemset(d_f_slow_x, 0, sizeof(double)*numAtomsHomeAndProxy));
2065 cudaCheck(cudaMemset(d_f_slow_y, 0, sizeof(double)*numAtomsHomeAndProxy));
2066 cudaCheck(cudaMemset(d_f_slow_z, 0, sizeof(double)*numAtomsHomeAndProxy));
2068 CUDASequencerKernel->accumulateForceToSOA(
2072 numPatchesHomeAndProxy,
2074 patchData->devData[deviceIndex].d_localPatches,
2075 patchData->devData[deviceIndex].f_bond,
2076 patchData->devData[deviceIndex].f_bond_nbond,
2077 patchData->devData[deviceIndex].f_bond_slow,
2078 patchData->devData[deviceIndex].forceStride,
2079 patchData->devData[deviceIndex].f_nbond,
2080 patchData->devData[deviceIndex].f_nbond_slow,
2081 patchData->devData[deviceIndex].f_slow,
2096 patchData->d_queues,
2097 patchData->d_queueCounters,
2107 printSOAPositionsAndVelocities();
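// startRun2: the nonbonded and slow force buffers are cleared with cudaMemset before
// accumulateForceToSOA appears to gather the bonded/nonbonded/slow contributions from
// the force kernels into the sequencer's SOA force arrays (f_normal/f_nbond/f_slow).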
2111 void SequencerCUDA::startRun3(
2117 const bool requestGlobalForces,
2118 int doGlobalMasterStaleForces,
2119 const bool requestForcesOutput,
2120 const bool requestGlobalForcesGPU,
2123 const bool doFixed = simParams->fixedAtomsOn;
2128 std::vector<int> atom_counts;
2130 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2132 CUDASequencerKernel->mergeForcesFromPeers(
2136 numPatchesHomeAndProxy,
2149 patchData->devData[deviceIndex].d_localPatches,
2150 patchData->devData[deviceIndex].d_peerPatches,
2167 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2168 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2171 if(doGlobalMasterStaleForces)
2173 computeGlobalMasterVirial(
2174 numPatchesHomeAndProxy,
2175 patchData->devData[deviceIndex].d_localPatches,
2183 &d_extForce[EXT_GLOBALMTS],
2184 &extForce[EXT_GLOBALMTS],
2185 &d_extVirial[EXT_GLOBALMTS],
2186 &extVirial[EXT_GLOBALMTS],
2195 calculateExternalForces(simParams->firstTimestep, reduction, maxForceNumber, 1, 1);
2198 if(true || deviceID == 0){
2200 snprintf(prefix, 10, "step-%d", 0);
2201 this->printSOAForces(prefix);
2205 CUDASequencerKernel->addForceToMomentum(
2206 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2208 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2209 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2210 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2211 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2212 numAtomsHome, maxForceNumber, stream);
2214 CUDASequencerKernel->rattle1(
2216 numAtomsHome, -dt_normal, -1.0/(dt_normal),
2218 d_vel_x, d_vel_y, d_vel_z,
2219 d_pos_x, d_pos_y, d_pos_z,
2220 d_velNew_x, d_velNew_y, d_velNew_z,
2221 d_posNew_x, d_posNew_y, d_posNew_z,
2222 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2223 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2224 &settleList, settleListSize, &d_consFailure,
2225 d_consFailureSize, &rattleList, rattleListSize,
2227 d_rigidVirial, rigidVirial, d_tbcatomic, true, sp,
2228 true, consFailure, simParams->watmodel, stream);
2230 CUDASequencerKernel->centerOfMass(
2231 d_vel_x, d_vel_y, d_vel_z,
2232 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2233 d_hydrogenGroupSize, numAtomsHome, stream);
2238 submitHalf(reduction, numAtomsHome, 1, 1);
2240 cudaCheck(cudaStreamSynchronize(stream));
2242 Tensor reduction_intVirialNormal;
2247 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2248 reduction_virial *= 0.5;
2252 += (intKineticEnergy_half[0] * 0.25);
2253 reduction_intVirialNormal *= 0.5;
2255 reduction_intVirialNormal);
2258 CUDASequencerKernel->addForceToMomentum(
2259 doFixed, 1.0, dt_normal, dt_nbond, dt_slow, 1.0,
2261 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2262 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2263 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2264 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2265 numAtomsHome, maxForceNumber, stream);
2267 CUDASequencerKernel->rattle1(
2269 numAtomsHome, dt_normal, 1.0/dt_normal,
2271 d_vel_x, d_vel_y, d_vel_z,
2272 d_pos_x, d_pos_y, d_pos_z,
2273 d_velNew_x, d_velNew_y, d_velNew_z,
2274 d_posNew_x, d_posNew_y, d_posNew_z,
2275 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2276 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
2277 &settleList, settleListSize, &d_consFailure,
2278 d_consFailureSize, &rattleList, rattleListSize,
2280 d_rigidVirial, rigidVirial, d_tbcatomic, 1, sp,
2281 buildRigidLists, consFailure, simParams->watmodel, stream);
2283 CUDASequencerKernel->centerOfMass(
2284 d_vel_x, d_vel_y, d_vel_z,
2285 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2286 d_hydrogenGroupSize, numAtomsHome, stream);
2290 submitHalf(reduction, numAtomsHome, 1, 1);
2292 cudaCheck(cudaStreamSynchronize(stream));
2294 Tensor reduction_intVirialNormal;
2299 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2300 reduction_virial *= 0.5;
2304 += (intKineticEnergy_half[0] * 0.25);
2305 reduction_intVirialNormal *= 0.5;
2307 reduction_intVirialNormal);
2310 CUDASequencerKernel->addForceToMomentum(
2311 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, 1.0,
2313 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2314 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2315 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2316 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2317 numAtomsHome, maxForceNumber, stream);
2319 if(requestGlobalForces || requestForcesOutput) {
2322 saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
2325 if (requestGlobalForcesGPU) {
2326 if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
2327 if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
2328 if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
2329 if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
2330 if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
2331 if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
2332 CUDASequencerKernel->copyForcesToDevice(
2333 numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2334 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2335 d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
2336 d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
2340 CUDASequencerKernel->centerOfMass(
2341 d_vel_x, d_vel_y, d_vel_z,
2342 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2343 d_hydrogenGroupSize, numAtomsHome, stream);
2345 submitReductions(reduction, origin.x, origin.y, origin.z,
2346 marginViolations, 1,
2348 numAtomsHome, maxForceNumber);
2350 copyPositionsAndVelocitiesToHost(1, 0);
2358 NAMD_die("constraint failure during CUDA rattle!\n");
2360 iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
2363 cudaCheck(cudaStreamSynchronize(stream));
2365 Tensor reduction_rigidVirial;
2368 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
2374 Vector momentum(*momentum_x, *momentum_y, *momentum_z);
2376 Vector angularMomentum(*angularMomentum_x,
2378 *angularMomentum_z);
2381 Tensor regintVirialNormal;
2382 Tensor regintVirialNbond;
2385 if (maxForceNumber >= 1) {
2388 if (maxForceNumber >= 2) {
2398 cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
2399 double3 fixForceNormal, fixForceNbond, fixForceSlow;
2400 switch (maxForceNumber) {
2402 copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
2403 copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
2407 cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
2410 copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
2411 copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
2415 cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
2418 copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
2419 copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
2423 cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
2427 auto printTensor = [](const cudaTensor& t, const std::string& name){
2428 CkPrintf("%s", name.c_str());
2429 CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
2430 "%12.5lf %12.5lf %12.5lf\n"
2431 "%12.5lf %12.5lf %12.5lf\n",
2436 printTensor(fixVirialNormal, "fixVirialNormal = ");
2437 printTensor(fixVirialNbond, "fixVirialNbond = ");
2438 printTensor(fixVirialSlow, "fixVirialSlow = ");
2446 this->printSOAForces(NULL);
2451 printSOAPositionsAndVelocities();
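// The addForceToMomentum calls above implement the usual velocity-Verlet style
// half-kicks (scale = -0.5, +1.0, -0.5) used to obtain half-step kinetic energies
// at startup. A minimal host-side sketch of a single per-atom kick, assuming
// per-level timesteps dt_normal/dt_nbond/dt_slow and a reciprocal mass (illustration
// only; the real kernel works on the SOA arrays and also honors d_atomFixed):
static inline void kickAtom(const double scale, const double dt, const double rmass,
                            const double fx, const double fy, const double fz,
                            double& vx, double& vy, double& vz) {
  // v += scale * dt * f / m, applied independently for each force level
  vx += scale * dt * fx * rmass;
  vy += scale * dt * fy * rmass;
  vz += scale * dt * fz * rmass;
}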
2455 void SequencerCUDA::monteCarloPressure_reject(Lattice &lattice)
2458 myLattice = lattice;
2462 temp = d_f_normal_x; d_f_normal_x = d_f_normalMC_x; d_f_normalMC_x = temp;
2463 temp = d_f_normal_y; d_f_normal_y = d_f_normalMC_y; d_f_normalMC_y = temp;
2464 temp = d_f_normal_z; d_f_normal_z = d_f_normalMC_z; d_f_normalMC_z = temp;
2465 temp = d_f_nbond_x; d_f_nbond_x = d_f_nbondMC_x; d_f_nbondMC_x = temp;
2466 temp = d_f_nbond_y; d_f_nbond_y = d_f_nbondMC_y; d_f_nbondMC_y = temp;
2467 temp = d_f_nbond_z; d_f_nbond_z = d_f_nbondMC_z; d_f_nbondMC_z = temp;
2468 temp = d_f_slow_x; d_f_slow_x = d_f_slowMC_x; d_f_slowMC_x = temp;
2469 temp = d_f_slow_y; d_f_slow_y = d_f_slowMC_y; d_f_slowMC_y = temp;
2470 temp = d_f_slow_z; d_f_slow_z = d_f_slowMC_z; d_f_slowMC_z = temp;
2471 #ifdef NAMD_NCCL_ALLREDUCE
2473 temp = d_posNew_x; d_posNew_x = d_posMC_x; d_posMC_x = temp;
2474 temp = d_posNew_y; d_posNew_y = d_posMC_y; d_posMC_y = temp;
2475 temp = d_posNew_z; d_posNew_z = d_posMC_z; d_posMC_z = temp;
2477 temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2478 temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2479 temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
2482 temp = d_pos_x; d_pos_x = d_posMC_x; d_posMC_x = temp;
2483 temp = d_pos_y; d_pos_y = d_posMC_y; d_posMC_y = temp;
2484 temp = d_pos_z; d_pos_z = d_posMC_z; d_posMC_z = temp;
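// The swaps above restore the force and position buffers saved before the proposed
// Monte Carlo volume move by exchanging raw device pointers (no device copies are
// issued); each line is equivalent to std::swap(d_f_normal_x, d_f_normalMC_x), etc.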
2488 void SequencerCUDA::monteCarloPressure_accept(
2490 const int doMigration)
2493 CUDASequencerKernel->centerOfMass(
2494 d_pos_x, d_pos_y, d_pos_z,
2495 d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2496 d_hydrogenGroupSize, numAtomsHome, stream);
2501 Tensor reduction_intVirialNormal;
2505 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
2506 reduction_virial *= 0.5;
2507 reduction_intVirialNormal *= 0.5;
2510 reduction_intVirialNormal);
2517 myLatticeOld = myLattice;
2521 void SequencerCUDA::monteCarloPressure_part1(
2528 copy_DtoD<double>(d_f_normal_x, d_f_normalMC_x, numAtomsHome, stream);
2529 copy_DtoD<double>(d_f_normal_y, d_f_normalMC_y, numAtomsHome, stream);
2530 copy_DtoD<double>(d_f_normal_z, d_f_normalMC_z, numAtomsHome, stream);
2531 copy_DtoD<double>(d_f_nbond_x, d_f_nbondMC_x, numAtomsHome, stream);
2532 copy_DtoD<double>(d_f_nbond_y, d_f_nbondMC_y, numAtomsHome, stream);
2533 copy_DtoD<double>(d_f_nbond_z, d_f_nbondMC_z, numAtomsHome, stream);
2534 copy_DtoD<double>(d_f_slow_x, d_f_slowMC_x, numAtomsHome, stream);
2535 copy_DtoD<double>(d_f_slow_y, d_f_slowMC_y, numAtomsHome, stream);
2536 copy_DtoD<double>(d_f_slow_z, d_f_slowMC_z, numAtomsHome, stream);
2537 #ifdef NAMD_NCCL_ALLREDUCE
2540 copy_DtoD<double>(d_posNew_x, d_posMC_x, numAtomsHome, stream);
2541 copy_DtoD<double>(d_posNew_y, d_posMC_y, numAtomsHome, stream);
2542 copy_DtoD<double>(d_posNew_z, d_posMC_z, numAtomsHome, stream);
2545 copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2546 copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2547 copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2551 copy_DtoD<double>(d_pos_x, d_posMC_x, numAtomsHome, stream);
2552 copy_DtoD<double>(d_pos_y, d_posMC_y, numAtomsHome, stream);
2553 copy_DtoD<double>(d_pos_z, d_posMC_z, numAtomsHome, stream);
2558 Lattice newLattice = oldLattice;
2567 CUDASequencerKernel->scaleCoordinateUsingGC(
2568 d_pos_x, d_pos_y, d_pos_z, d_idOrder, d_moleculeStartIndex,
2569 d_moleculeAtom, cuFactor, cuOrigin, myLattice, newLattice,
2574 myLattice = newLattice;
2579 bool doNbond = patchData->flags.doNonbonded;
2580 bool doSlow = patchData->flags.doFullElectrostatics;
2583 bool doAlchDecouple = false;
2584 bool doAlchSoftCore = false;
2587 if (simParams->alchThermIntOn) doTI = true;
2588 if (simParams->alchDecouple) doAlchDecouple = true;
2589 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2593 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
2596 bool usePatchPme = false;
2609 getNumPatchesHome(),
2618 std::vector<int> atom_counts;
2620 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2622 CUDASequencerKernel->set_compute_positions(
2626 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
2627 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
2628 #ifdef NAMD_NCCL_ALLREDUCE
2629 (mGpuOn) ? d_posNew_x: d_pos_x,
2630 (mGpuOn) ? d_posNew_y: d_pos_y,
2631 (mGpuOn) ? d_posNew_z: d_pos_z,
2642 d_charge, d_partition, charge_scaling,
2644 patchData->devData[deviceIndex].slow_patchPositions,
2645 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
2646 d_sortOrder, newLattice,
2647 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
2648 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
2650 patchData->devData[deviceIndex].d_localPatches,
2651 patchData->devData[deviceIndex].d_peerPatches,
2655 cudaCheck(cudaStreamSynchronize(stream));
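// monteCarloPressure_part1 backs up forces/positions and then rescales coordinates
// into the trial cell. Illustrative sketch only (an assumption about the general
// scheme, not the actual kernel): a volume move rescales positions about an origin
// by the ratio of new to old cell vectors; scaleCoordinateUsingGC does this per
// molecule on the GPU using geometric centers and the old/new Lattice objects.
static inline double3 scaleAboutOrigin(const double3 x, const double3 origin,
                                       const double3 factor) {
  // x' = origin + factor * (x - origin), applied component-wise
  return make_double3(origin.x + factor.x * (x.x - origin.x),
                      origin.y + factor.y * (x.y - origin.y),
                      origin.z + factor.z * (x.z - origin.z));
}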
2658 void SequencerCUDA::monteCarloPressure_part2(
2662 const bool doEnergy,
2663 const bool doGlobal,
2664 const bool doVirial)
2670 #ifdef NAMD_NCCL_ALLREDUCE
2671 cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtoms*3*(maxForceNumber+1)));
2675 CUDASequencerKernel->accumulateForceToSOA(
2679 numPatchesHomeAndProxy,
2681 patchData->devData[deviceIndex].d_localPatches,
2682 patchData->devData[deviceIndex].f_bond,
2683 patchData->devData[deviceIndex].f_bond_nbond,
2684 patchData->devData[deviceIndex].f_bond_slow,
2685 patchData->devData[deviceIndex].forceStride,
2686 patchData->devData[deviceIndex].f_nbond,
2687 patchData->devData[deviceIndex].f_nbond_slow,
2688 patchData->devData[deviceIndex].f_slow,
2703 patchData->d_queues,
2704 patchData->d_queueCounters,
2709 #ifndef NAMD_NCCL_ALLREDUCE
2713 std::vector<int> atom_counts;
2715 atom_counts.push_back(patchData->devData[i].numAtomsHome);
2717 CUDASequencerKernel->mergeForcesFromPeers(
2721 numPatchesHomeAndProxy,
2734 patchData->devData[deviceIndex].d_localPatches,
2735 patchData->devData[deviceIndex].d_peerPatches,
2740 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
2741 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
2745 calculateExternalForces(step, reduction, maxForceNumber, doEnergy, doVirial);
2748 if(true || deviceID == 0){
2750 snprintf(prefix, 10, "step-%d", step);
2751 this->printSOAForces(prefix);
2757 void SequencerCUDA::launch_part1(
2762 double velrescaling,
2763 const double maxvel2,
2768 int reassignVelocitiesStep,
2769 int langevinPistonStep,
2770 int berendsenPressureStep,
2773 const int savePairlists,
2774 const int usePairlists,
2775 const bool doEnergy)
2780 this->maxvel2 = maxvel2;
2782 const bool doFixed = simParams->fixedAtomsOn;
2785 myLattice = lattice;
2786 if(reassignVelocitiesStep)
2788 const int reassignFreq = simParams->reassignFreq;
2790 newTemp += ( step / reassignFreq ) * simParams->reassignIncr;
2795 if ( newTemp < simParams->reassignHold )
2800 CUDASequencerKernel->reassignVelocities(
2801 dt_normal, simParams->fixedAtomsOn, d_atomFixed,
2802 d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
2803 d_vel_x, d_vel_y, d_vel_z,
2805 numAtomsHome, numAtomsHome, 0,
2810 if(berendsenPressureStep) {
2815 CUDASequencerKernel->scaleCoordinateWithFactor(
2816 d_pos_x, d_pos_y, d_pos_z, d_mass, d_hydrogenGroupSize,
2817 cuFactor, cuOrigin, simParams->useGroupPressure, numAtomsHome, stream);
2820 if(!langevinPistonStep){
2823 CUDASequencerKernel->velocityVerlet1(
2824 doFixed, patchData->flags.step, 0.5, dt_normal, dt_nbond,
2825 dt_slow, velrescaling, d_recipMass,
2826 d_vel_x, d_vel_y, d_vel_z, maxvel2, killme, d_pos_x, d_pos_y, d_pos_z,
2827 pos_x, pos_y, pos_z, d_f_normal_x, d_f_normal_y, d_f_normal_z,
2828 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z,
2829 d_atomFixed, numAtomsHome, maxForceNumber, stream);
2832 CUDASequencerKernel->addForceToMomentum(
2833 doFixed, 0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
2835 d_f_normal_x, d_f_normal_y, d_f_normal_z,
2836 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
2837 d_f_slow_x, d_f_slow_y, d_f_slow_z,
2838 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
2839 numAtomsHome, maxForceNumber, stream);
2841 maximumMove(maxvel2, numAtomsHome);
2850 CUDASequencerKernel->addVelocityToPosition(
2851 simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2852 d_vel_x, d_vel_y, d_vel_z,
2853 d_pos_x, d_pos_y, d_pos_z,
2854 pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2855 CUDASequencerKernel->langevinPiston(
2857 d_groupFixed, d_transform, lattice,
2858 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z,
2859 d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z,
2860 d_mass, d_hydrogenGroupSize,
2861 cuFactor, cuOrigin, velFactor_x, velFactor_y, velFactor_z,
2862 simParams->useGroupPressure, numAtomsHome, stream);
2863 CUDASequencerKernel->addVelocityToPosition(
2864 simParams->fixedAtomsOn, 0.5*dt_normal, d_atomFixed,
2865 d_vel_x, d_vel_y, d_vel_z,
2866 d_pos_x, d_pos_y, d_pos_z,
2867 pos_x, pos_y, pos_z, numAtomsHome, false, stream);
2871 if( (doEnergy || doVirial) ) {
2872 CUDASequencerKernel->centerOfMass(
2873 d_pos_x, d_pos_y, d_pos_z,
2874 d_rcm_x, d_rcm_y, d_rcm_z, d_mass,
2875 d_hydrogenGroupSize, numAtomsHome, stream);
2876 CUDASequencerKernel->centerOfMass(
2877 d_vel_x, d_vel_y, d_vel_z,
2878 d_vcm_x, d_vcm_y, d_vcm_z, d_mass,
2879 d_hydrogenGroupSize, numAtomsHome, stream);
2885 bool doNbond = patchData->flags.doNonbonded;
2886 bool doSlow = patchData->flags.doFullElectrostatics;
2890 bool doAlchDecouple = false;
2891 bool doAlchSoftCore = false;
2894 if (simParams->alchThermIntOn) doTI = true;
2895 if (simParams->alchDecouple) doAlchDecouple = true;
2896 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2898 if ( ! savePairlists ) {
2901 double sysdima = lattice.a_r().unit() * lattice.a();
2902 double sysdimb = lattice.b_r().unit() * lattice.b();
2903 double sysdimc = lattice.c_r().unit() * lattice.c();
2908 rescalePairlistTolerance = false;
2909 CUDASequencerKernel->PairListMarginCheck(numPatchesHome,
2910 patchData->devData[deviceIndex].d_localPatches,
2911 d_pos_x, d_pos_y, d_pos_z, d_posSave_x, d_posSave_y, d_posSave_z,
2913 myLattice, myLatticeOld,
2914 d_patchMin, d_patchMax, d_patchCenter,
2916 d_tbcatomic, simParams->pairlistTrigger,
2918 d_patchMaxAtomMovement, patchMaxAtomMovement,
2919 d_patchNewTolerance, patchNewTolerance,
2920 minSize, simParams->cutoff, sysdima, sysdimb, sysdimc,
2922 h_periodicCellSmall,
2923 rescalePairlistTolerance,
2924 isPeriodic, stream);
2928 rescalePairlistTolerance = true;
2933 cudaCheck(cudaStreamSynchronize(stream));
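// launch_part1 performs the first half-kick and drift. When the Langevin piston is
// active, the drift is split into two half drifts (x += 0.5*dt*v) around the cell
// rescaling; otherwise velocityVerlet1 fuses kick and full drift. When pairlists are
// being reused, PairListMarginCheck compares atom displacements since the last list
// build (d_posSave_*, myLatticeOld) against the per-patch tolerance.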
2937 void SequencerCUDA::launch_part11(
2941 double velrescaling,
2942 const double maxvel2,
2947 int langevinPistonStep,
2950 const int savePairlists,
2951 const int usePairlists,
2952 const bool doEnergy)
2954 const bool doVirial = simParams->langevinPistonOn;
2958 bool doNbond = patchData->flags.doNonbonded;
2959 bool doSlow = patchData->flags.doFullElectrostatics;
2963 bool doAlchDecouple = false;
2964 bool doAlchSoftCore = false;
2967 if (simParams->alchThermIntOn) doTI = true;
2968 if (simParams->alchDecouple) doAlchDecouple = true;
2969 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
2972 submitHalf(reduction, numAtomsHome, 1, doEnergy || doVirial);
2976 this->update_patch_flags();
2979 finish_part1(copyIn, patchList[0]->flags.savePairlists,
2980 patchList[0]->flags.usePairlists, reduction);
2984 void SequencerCUDA::launch_set_compute_positions() {
2989 bool doNbond = patchData->flags.doNonbonded;
2990 bool doSlow = patchData->flags.doFullElectrostatics;
2994 bool doAlchDecouple = false;
2995 bool doAlchSoftCore = false;
2998 if (simParams->alchThermIntOn) doTI = true;
2999 if (simParams->alchDecouple) doAlchDecouple = true;
3000 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3005 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3008 bool usePatchPme = false;
3021 getNumPatchesHome(),
3034 std::vector<int> atom_counts;
3036 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3038 CUDASequencerKernel->set_compute_positions(
3042 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3043 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3044 #ifdef NAMD_NCCL_ALLREDUCE
3045 (mGpuOn) ? d_posNew_x: d_pos_x,
3046 (mGpuOn) ? d_posNew_y: d_pos_y,
3047 (mGpuOn) ? d_posNew_z: d_pos_z,
3058 d_charge, d_partition, charge_scaling,
3060 patchData->devData[deviceIndex].slow_patchPositions,
3061 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3062 d_sortOrder, myLattice,
3063 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3064 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3066 patchData->devData[deviceIndex].d_localPatches,
3067 patchData->devData[deviceIndex].d_peerPatches,
3074 copyPositionsToHost_direct();
3081 void SequencerCUDA::finish_part1(const int copyIn,
3082 const int savePairlists,
3083 const int usePairlists,
3095 cudaCheck(cudaStreamSynchronize(stream));
3098 if(*h_periodicCellSmall){
3099 NAMD_die("Periodic cell has become too small for original patch grid!\n"
3100 "Possible solutions are to restart from a recent checkpoint,\n"
3101 "increase margin, or disable useFlexibleCell for liquid simulation.");
3108 double *vel_x, *vel_y, *vel_z;
3109 allocate_host<double>(&vel_x, numAtomsHome);
3110 allocate_host<double>(&vel_y, numAtomsHome);
3111 allocate_host<double>(&vel_z, numAtomsHome);
3112 copy_DtoH<double>(d_vel_x, vel_x, numAtomsHome);
3113 copy_DtoH<double>(d_vel_y, vel_y, numAtomsHome);
3114 copy_DtoH<double>(d_vel_z, vel_z, numAtomsHome);
3116 for (int i=0; i < numAtomsHome; i++) {
3118 vel_x[i] * vel_x[i] + vel_y[i] * vel_y[i] + vel_z[i] * vel_z[i];
3119 if (vel2 > maxvel2) {
3127 << i << " of " << numAtomsHome
3128 << " pe " << CkMyPe() << ")\n" << endi;
3131 iout << iERROR << "Atoms moving too fast at timestep " << patchList[0]->flags.step <<
3132 "; simulation has become unstable ("
3133 << cnt << " atoms on pe " << CkMyPe() << ").\n" << endi;
3134 deallocate_host<double>(&vel_x);
3135 deallocate_host<double>(&vel_y);
3136 deallocate_host<double>(&vel_z);
3137 NAMD_die("SequencerCUDA: Atoms moving too fast");
3142 Tensor reduction_intVirialNormal;
3147 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3148 reduction_virial *= 0.5;
3152 += (intKineticEnergy_half[0] * 0.25);
3153 reduction_intVirialNormal *= 0.5;
3155 reduction_intVirialNormal);
3156 int migration = (h_marginViolations[0] != 0) ? 1 :0;
3158 patchData->migrationFlagPerDevice[deviceIndex] = migration;
3159 h_marginViolations[0] = 0;
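// finish_part1: a too-small periodic cell and too-fast atoms both abort the run;
// otherwise the half-step kinetic/virial contributions are folded into the reduction,
// and the per-device migration flag is raised whenever margin violations were
// recorded (h_marginViolations is then reset for the next step).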
3163 void SequencerCUDA::copyPositionsAndVelocitiesToHost(
3164 bool copyOut, const int doGlobal){
3167 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3168 patchData = cpdata.ckLocalBranch();
3169 std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3170 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3171 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3172 const int numAtomsToCopy = numAtomsHome;
3173 copy_DtoH<double>(d_vel_x, vel_x, numAtomsToCopy, stream);
3174 copy_DtoH<double>(d_vel_y, vel_y, numAtomsToCopy, stream);
3175 copy_DtoH<double>(d_vel_z, vel_z, numAtomsToCopy, stream);
3178 copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3179 copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3180 copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3184 for(int i = 0; i < homePatches.size(); i++){
3188 const int numPatchAtoms = localPatches[i].numAtoms;
3189 const int offset = localPatches[i].bufferOffset;
3190 memcpy(current.vel_x, vel_x + offset, numPatchAtoms*sizeof(double));
3191 memcpy(current.vel_y, vel_y + offset, numPatchAtoms*sizeof(double));
3192 memcpy(current.vel_z, vel_z + offset, numPatchAtoms*sizeof(double));
3195 memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3196 memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3197 memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
3204 void SequencerCUDA::copyPositionsToHost(){
3206 CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
3207 patchData = cpdata.ckLocalBranch();
3208 std::vector<CudaPeerRecord>& myPeerPatches = patchData->devData[deviceIndex].h_peerPatches;
3209 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3210 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3212 const int numAtomsToCopy = numAtomsHome;
3214 copy_DtoH<double>(d_pos_x, pos_x, numAtomsToCopy, stream);
3215 copy_DtoH<double>(d_pos_y, pos_y, numAtomsToCopy, stream);
3216 copy_DtoH<double>(d_pos_z, pos_z, numAtomsToCopy, stream);
3219 for(int i = 0; i < homePatches.size(); i++){
3223 const int numPatchAtoms = localPatches[i].numAtoms;
3224 const int offset = localPatches[i].bufferOffset;
3225 memcpy(current.pos_x, pos_x + offset, numPatchAtoms*sizeof(double));
3226 memcpy(current.pos_y, pos_y + offset, numPatchAtoms*sizeof(double));
3227 memcpy(current.pos_z, pos_z + offset, numPatchAtoms*sizeof(double));
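// Both copy-back helpers use the same two-stage pattern: a bulk copy_DtoH of the SOA
// arrays for the home atoms into host staging buffers, followed by a per-patch memcpy
// scatter into each HomePatch's PatchDataSOA using the bufferOffset recorded in the
// CudaLocalRecord table.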
3231 void SequencerCUDA::update_patch_flags()
3234 int pairlists = (patchData->flags.step < simParams->N);
3235 for (int i=0; i < numPatchesHome; i++) {
3241 void SequencerCUDA::updatePairlistFlags(const int doMigration){
3242 int pairlists = patchList[0]->flags.step < simParams->N;
3243 for(int i = 0; i < numPatchesHome; i++){
3263 patch->doPairlistCheck_newTolerance *= (1 - simParams->pairlistShrink);
3267 patch->doPairlistCheck_newTolerance = patchNewTolerance[i];
3274 if(patchList[0]->flags.savePairlists){
3276 copy_DtoD<double>(d_pos_x, d_posSave_x, numAtomsHome, stream);
3277 copy_DtoD<double>(d_pos_y, d_posSave_y, numAtomsHome, stream);
3278 copy_DtoD<double>(d_pos_z, d_posSave_z, numAtomsHome, stream);
3279 myLatticeOld = myLattice;
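// When pairlists are (re)built on this step, the current positions and lattice are
// saved (d_posSave_*, myLatticeOld) so later steps can measure how far atoms have
// drifted since the last list build; doPairlistCheck_newTolerance shrinks gradually
// via simParams->pairlistShrink between rebuilds.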
3283 void SequencerCUDA::finish_patch_flags(int migration)
3285 for (int i=0; i < numPatchesHome; i++) {
3299 void SequencerCUDA::launch_part2(
3300 const int doMCPressure,
3308 const int langevinPistonStep,
3312 const bool doEnergy)
3317 bool doNbond = false;
3318 bool doSlow = false;
3329 #ifdef NAMD_NCCL_ALLREDUCE
3330 cudaCheck(cudaMemset(d_f_raw, 0, sizeof(double)*numAtomsHomeAndProxy*3*(maxForceNumber+1)));
3339 (!is_lonepairs_psf)){
3340 CUDASequencerKernel->accumulate_force_kick(
3345 numPatchesHomeAndProxy,
3346 patchData->devData[deviceIndex].d_localPatches,
3347 patchData->devData[deviceIndex].f_bond,
3348 patchData->devData[deviceIndex].f_bond_nbond,
3349 patchData->devData[deviceIndex].f_bond_slow,
3350 patchData->devData[deviceIndex].forceStride,
3351 patchData->devData[deviceIndex].f_nbond,
3352 patchData->devData[deviceIndex].f_nbond_slow,
3353 patchData->devData[deviceIndex].f_slow,
3380 CUDASequencerKernel->accumulateForceToSOA(
3384 numPatchesHomeAndProxy,
3386 patchData->devData[deviceIndex].d_localPatches,
3387 patchData->devData[deviceIndex].f_bond,
3388 patchData->devData[deviceIndex].f_bond_nbond,
3389 patchData->devData[deviceIndex].f_bond_slow,
3390 patchData->devData[deviceIndex].forceStride,
3391 patchData->devData[deviceIndex].f_nbond,
3392 patchData->devData[deviceIndex].f_nbond_slow,
3393 patchData->devData[deviceIndex].f_slow,
3408 patchData->d_queues,
3409 patchData->d_queueCounters,
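// launch_part2 has two paths: a fused accumulate_force_kick kernel (taken, it seems,
// only when no lone-pair redistribution is required) and the plain accumulateForceToSOA
// gather shown above, which leaves the velocity kick to launch_part3.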
3423 void SequencerCUDA::launch_part3(
3424 const int doMCPressure,
3432 const bool requestGlobalForces,
3433 const int doGlobalStaleForces,
3434 const bool forceRequestedGPU,
3437 const bool doEnergy,
3438 const bool requestForcesOutput)
3441 const bool doFixed = simParams->fixedAtomsOn;
3442 const double velrescaling = 1;
3449 #ifndef NAMD_NCCL_ALLREDUCE
3454 std::vector<int> atom_counts;
3456 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3458 CUDASequencerKernel->mergeForcesFromPeers(
3462 numPatchesHomeAndProxy,
3475 patchData->devData[deviceIndex].d_localPatches,
3476 patchData->devData[deviceIndex].d_peerPatches,
3481 int numReducedAtoms = (3 * (maxForceNumber+1)) * numAtoms;
3482 ncclAllReduce(d_f_raw, d_f_raw, numReducedAtoms, ncclDouble, ncclSum, deviceCUDA->getNcclComm(), stream);
3485 if(doVirial && doGlobalStaleForces)
3487 memset(&extVirial[EXT_GLOBALMTS], 0, sizeof(cudaTensor));
3488 memset(&extForce[EXT_GLOBALMTS], 0, sizeof(double3));
3489 computeGlobalMasterVirial(
3490 numPatchesHomeAndProxy,
3491 patchData->devData[deviceIndex].d_localPatches,
3499 &d_extForce[EXT_GLOBALMTS],
3500 &extForce[EXT_GLOBALMTS],
3501 &d_extVirial[EXT_GLOBALMTS],
3502 &extVirial[EXT_GLOBALMTS],
3506 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GLOBALMTS]);
3507 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GLOBALMTS]);
3509 calculateExternalForces(step, reduction, maxForceNumber, doEnergy, doVirial);
3512 if(true || deviceID == 0){
3514 snprintf(prefix, 10, "step-%d", step);
3515 this->printSOAForces(prefix);
3522 CUDASequencerKernel->langevinVelocitiesBBK1(
3523 dt_normal, d_langevinParam, d_vel_x, d_vel_y, d_vel_z, numAtomsHome, stream);
3530 CUDASequencerKernel->addForceToMomentum(
3531 doFixed, 1.0, dt_normal, dt_nbond, dt_slow, velrescaling,
3533 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3534 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3535 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3536 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3537 numAtomsHome, maxForceNumber, stream);
3545 CUDASequencerKernel->rattle1(
3546 simParams->fixedAtomsOn, doEnergy || doVirial,
3547 1, numAtomsHome, dt_normal, 1.0/dt_normal,
3549 d_vel_x, d_vel_y, d_vel_z,
3550 d_pos_x, d_pos_y, d_pos_z,
3551 d_velNew_x, d_velNew_y, d_velNew_z,
3552 d_posNew_x, d_posNew_y, d_posNew_z,
3553 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3554 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3555 &settleList, settleListSize, &d_consFailure,
3556 d_consFailureSize, &rattleList, rattleListSize,
3558 d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3559 buildRigidLists, consFailure, simParams->watmodel, stream);
3560 buildRigidLists = false;
3562 CUDASequencerKernel->langevinVelocitiesBBK2(
3563 dt_normal, d_langScalVelBBK2, d_langScalRandBBK2,
3564 d_gaussrand_x, d_gaussrand_y, d_gaussrand_z,
3565 d_vel_x, d_vel_y, d_vel_z,
3566 numAtomsHome, numAtomsHome, 0,
3570 CUDASequencerKernel->rattle1(
3571 simParams->fixedAtomsOn, doEnergy || doVirial,
3572 1, numAtomsHome, dt_normal, 1.0/dt_normal,
3574 d_vel_x, d_vel_y, d_vel_z,
3575 d_pos_x, d_pos_y, d_pos_z,
3576 d_velNew_x, d_velNew_y, d_velNew_z,
3577 d_posNew_x, d_posNew_y, d_posNew_z,
3578 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3579 d_hydrogenGroupSize, d_rigidBondLength, d_mass, d_atomFixed,
3580 &settleList, settleListSize, &d_consFailure,
3581 d_consFailureSize, &rattleList, rattleListSize,
3583 d_rigidVirial, rigidVirial, d_tbcatomic, copyIn, sp,
3584 buildRigidLists, consFailure, simParams->watmodel, stream);
3585 buildRigidLists = false;
3589 if(doEnergy || doVirial){
3590 CUDASequencerKernel->centerOfMass(
3591 d_vel_x, d_vel_y, d_vel_z,
3592 d_vcm_x, d_vcm_y, d_vcm_z,
3593 d_mass, d_hydrogenGroupSize, numAtomsHome, stream);
3596 submitHalf(reduction, numAtomsHome, 2, doEnergy || doVirial);
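// Langevin dynamics here follows the BBK splitting: langevinVelocitiesBBK1 applies the
// per-atom langevinParam (friction) scaling, the deterministic force kick and RATTLE
// follow, then langevinVelocitiesBBK2 adds the scaled Gaussian random kicks and the
// constraints are enforced once more before the half-step kinetic energy is submitted.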
3598 CUDASequencerKernel->addForceToMomentum(
3599 doFixed, -0.5, dt_normal, dt_nbond, dt_slow, velrescaling,
3601 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3602 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3603 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3604 d_vel_x, d_vel_y, d_vel_z, d_atomFixed,
3605 numAtomsHome, maxForceNumber, stream);
3607 if(requestGlobalForces || requestForcesOutput) {
3610 saveForceCUDASOA_direct(requestGlobalForces, requestForcesOutput, maxForceNumber);
3613 if (forceRequestedGPU) {
3614 if (d_f_saved_nbond_x == nullptr) allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
3615 if (d_f_saved_nbond_y == nullptr) allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
3616 if (d_f_saved_nbond_z == nullptr) allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
3617 if (d_f_saved_slow_x == nullptr) allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
3618 if (d_f_saved_slow_y == nullptr) allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
3619 if (d_f_saved_slow_z == nullptr) allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);
3620 CUDASequencerKernel->copyForcesToDevice(
3621 numAtomsHome, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3622 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3623 d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
3624 d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z, maxForceNumber, stream);
3629 submitReductions(reduction, origin.x, origin.y, origin.z,
3630 marginViolations, doEnergy || doVirial,
3631 copyOut && simParams->outputMomenta != 0,
3632 numAtomsHome, maxForceNumber);
3635 copyPositionsAndVelocitiesToHost(copyOut, 0);
3643 NAMD_die("constraint failure during CUDA rattle!\n");
3645 iout << iWARN << "constraint failure during CUDA rattle!\n" << endi;
3647 } else if(doEnergy || doVirial){
3648 cudaCheck(cudaStreamSynchronize(stream));
3650 Tensor reduction_rigidVirial;
3653 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_rigidVirial);
3659 Tensor reduction_intVirialNormal;
3664 if (!simParams->fixedAtomsOn) tensor_enforce_symmetry(reduction_virial);
3665 reduction_virial *= 0.5;
3669 += (intKineticEnergy_half[0] * 0.25);
3670 reduction_intVirialNormal *= 0.5;
3672 reduction_intVirialNormal);
3676 Vector momentum(*momentum_x, *momentum_y, *momentum_z);
3678 Vector angularMomentum(*angularMomentum_x,
3680 *angularMomentum_z);
3683 Tensor regintVirialNormal;
3684 Tensor regintVirialNbond;
3687 if (maxForceNumber >= 1) {
3690 if (maxForceNumber >= 2) {
3700 cudaTensor fixVirialNormal, fixVirialNbond, fixVirialSlow;
3701 double3 fixForceNormal, fixForceNbond, fixForceSlow;
3702 switch (maxForceNumber) {
3704 copy_DtoH(d_fixVirialSlow, &fixVirialSlow, 1);
3705 copy_DtoH(d_fixForceSlow, &fixForceSlow, 1);
3709 cudaCheck(cudaMemset(d_fixForceSlow, 0, 1 * sizeof(double3)));
3712 copy_DtoH(d_fixVirialNbond, &fixVirialNbond, 1);
3713 copy_DtoH(d_fixForceNbond, &fixForceNbond, 1);
3717 cudaCheck(cudaMemset(d_fixForceNbond, 0, 1 * sizeof(double3)));
3720 copy_DtoH(d_fixVirialNormal, &fixVirialNormal, 1);
3721 copy_DtoH(d_fixForceNormal, &fixForceNormal, 1);
3725 cudaCheck(cudaMemset(d_fixForceNormal, 0, 1 * sizeof(double3)));
3729 auto printTensor = [](const cudaTensor& t, const std::string& name){
3730 CkPrintf("%s", name.c_str());
3731 CkPrintf("\n%12.5lf %12.5lf %12.5lf\n"
3732 "%12.5lf %12.5lf %12.5lf\n"
3733 "%12.5lf %12.5lf %12.5lf\n",
3738 printTensor(fixVirialNormal, "fixVirialNormal = ");
3739 printTensor(fixVirialNbond, "fixVirialNbond = ");
3740 printTensor(fixVirialSlow, "fixVirialSlow = ");
3749 void SequencerCUDA::atomUpdatePme()
3754 bool doNbond = false;
3759 bool doAlchDecouple = false;
3760 bool doAlchSoftCore = false;
3763 if (simParams->alchThermIntOn) doTI = true;
3764 if (simParams->alchDecouple) doAlchDecouple = true;
3765 if (simParams->alchElecLambdaStart > 0) doAlchSoftCore = true;
3769 lonepairsKernel->reposition(d_pos_x, d_pos_y, d_pos_z, stream);
3772 bool usePatchPme = false;
3785 getNumPatchesHome(),
3794 std::vector<int> atom_counts;
3796 atom_counts.push_back(patchData->devData[i].numAtomsHome);
3798 CUDASequencerKernel->set_pme_positions(
3802 numPatchesHomeAndProxy, numPatchesHome, doNbond, doSlow,
3803 doFEP, doTI, doAlchDecouple, doAlchSoftCore, !usePatchPme,
3804 #ifdef NAMD_NCCL_ALLREDUCE
3805 (mGpuOn) ? d_posNew_x: d_pos_x,
3806 (mGpuOn) ? d_posNew_y: d_pos_y,
3807 (mGpuOn) ? d_posNew_z: d_pos_z,
3818 d_charge, d_partition, charge_scaling,
3820 patchData->devData[deviceIndex].slow_patchPositions,
3821 patchData->devData[deviceIndex].slow_pencilPatchIndex, patchData->devData[deviceIndex].slow_patchID,
3822 d_sortOrder, myLattice,
3823 (float4*) patchData->devData[deviceIndex].nb_datoms, patchData->devData[deviceIndex].b_datoms,
3824 (float4*)patchData->devData[deviceIndex].s_datoms, patchData->devData[deviceIndex].s_datoms_partition,
3826 patchData->devData[deviceIndex].d_localPatches,
3827 patchData->devData[deviceIndex].d_peerPatches,
3831 cudaCheck(cudaStreamSynchronize(stream));
3836 void SequencerCUDA::sync() {
3837 cudaCheck(cudaStreamSynchronize(stream));
3840 void SequencerCUDA::calculateExternalForces(
3843 const int maxForceNumber,
3845 const int doVirial) {
3850 if (is_lonepairs_psf) {
3851 lonepairsKernel->redistributeForce(
3852 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3853 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
3854 d_f_slow_x, d_f_slow_y, d_f_slow_z,
3855 d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
3856 d_pos_x, d_pos_y, d_pos_z, maxForceNumber, doEnergy || doVirial, stream);
3859 if (is_tip4_water) {
3860 redistributeTip4pForces(reduction, maxForceNumber, doEnergy || doVirial);
3870 double efield_phi = PI/180. * simParams->eFieldPhase;
3873 CUDASequencerKernel->apply_Efield(numAtomsHome, simParams->eFieldNormalized,
3874 doEnergy || doVirial, efield, efield_omega, efield_phi, t , myLattice, d_transform,
3875 d_charge, d_pos_x, d_pos_y, d_pos_z,
3876 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3877 &d_extForce[EXT_ELEC_FIELD], &d_extVirial[EXT_ELEC_FIELD],
3878 &d_extEnergy[EXT_ELEC_FIELD], &extForce[EXT_ELEC_FIELD],
3879 &extVirial[EXT_ELEC_FIELD], &extEnergy[EXT_ELEC_FIELD],
3880 d_tbcatomic, stream);
3884 restraintsKernel->doForce(&myLattice, doEnergy, doVirial, step,
3885 d_pos_x, d_pos_y, d_pos_z,
3886 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3887 &d_extEnergy[EXT_CONSTRAINTS], &extEnergy[EXT_CONSTRAINTS],
3888 &d_extForce[EXT_CONSTRAINTS], &extForce[EXT_CONSTRAINTS],
3889 &d_extVirial[EXT_CONSTRAINTS], &extVirial[EXT_CONSTRAINTS]);
3893 SMDKernel->doForce(step, myLattice, doEnergy || doVirial,
3894 d_mass, d_pos_x, d_pos_y, d_pos_z, d_transform,
3895 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3896 &d_extVirial[EXT_SMD], &extEnergy[EXT_SMD],
3897 &extForce[EXT_SMD], &extVirial[EXT_SMD], stream);
3901 groupRestraintsKernel->doForce(step, doEnergy, doVirial,
3902 myLattice, d_transform,
3903 d_mass, d_pos_x, d_pos_y, d_pos_z,
3904 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3905 &d_extVirial[EXT_GROUP_RESTRAINTS], &extEnergy[EXT_GROUP_RESTRAINTS],
3906 &extForce[EXT_GROUP_RESTRAINTS], &extVirial[EXT_GROUP_RESTRAINTS], stream);
3909 gridForceKernel->doForce(doEnergy, doVirial,
3911 d_pos_x, d_pos_y, d_pos_z, d_transform,
3912 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3916 consForceKernel->doForce(myLattice, doVirial,
3917 d_pos_x, d_pos_y, d_pos_z,
3918 d_f_normal_x, d_f_normal_y, d_f_normal_z,
3920 &d_extForce[EXT_CONSFORCE], &extForce[EXT_CONSFORCE],
3921 &d_extVirial[EXT_CONSFORCE], &extVirial[EXT_CONSFORCE], stream);
3924 if(doEnergy || doVirial) {
3926 cudaCheck(cudaStreamSynchronize(stream));
3927 if (is_lonepairs_psf || is_tip4_water) {
3928 switch (maxForceNumber) {
3930 copy_DtoH_sync<cudaTensor>(d_lpVirialSlow, lpVirialSlow, 1);
3934 copy_DtoH_sync<cudaTensor>(d_lpVirialNbond, lpVirialNbond, 1);
3938 copy_DtoH_sync<cudaTensor>(d_lpVirialNormal, lpVirialNormal, 1);
3946 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_ELEC_FIELD]);
3947 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_ELEC_FIELD]);
3953 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSTRAINTS]);
3954 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSTRAINTS]);
3965 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GROUP_RESTRAINTS]);
3966 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GROUP_RESTRAINTS]);
3971 gridForceKernel->sumEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
3973 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_GRIDFORCE]);
3974 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_GRIDFORCE]);
3975 gridForceKernel->zeroOutEnergyVirialForcesAcrossGrids(&extEnergy[EXT_GRIDFORCE], &extForce[EXT_GRIDFORCE], &extVirial[EXT_GRIDFORCE]);
3979 ADD_VECTOR_OBJECT(reduction, REDUCTION_EXT_FORCE_NORMAL, extForce[EXT_CONSFORCE]);
3980 ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NORMAL, extVirial[EXT_CONSFORCE]);
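// calculateExternalForces accumulates each enabled external term (lone pairs / TIP4P,
// electric field, harmonic restraints, SMD, group restraints, grid forces, constant
// forces) into its own EXT_* slot of d_extEnergy/d_extForce/d_extVirial; only when
// energies or virials are requested does the host synchronize the stream and push the
// slots into the reduction via ADD_TENSOR_OBJECT / ADD_VECTOR_OBJECT.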
3986 void SequencerCUDA::copyGlobalForcesToDevice(){
3989 std::vector<CudaLocalRecord>& localPatches = patchData->devData[deviceIndex].h_localPatches;
3991 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
3993 for(int i = 0; i < numPatchesHome; i++){
3995 const int patchID = record.patchID;
3997 const int numPatchAtoms = record.numAtoms;
3999 memcpy(f_global_x + stride, current.f_global_x, numPatchAtoms*sizeof(double));
4000 memcpy(f_global_y + stride, current.f_global_y, numPatchAtoms*sizeof(double));
4001 memcpy(f_global_z + stride, current.f_global_z, numPatchAtoms*sizeof(double));
4004 copy_HtoD<double>(f_global_x, d_f_global_x, numAtomsHome, stream);
4005 copy_HtoD<double>(f_global_y, d_f_global_y, numAtomsHome, stream);
4006 copy_HtoD<double>(f_global_z, d_f_global_z, numAtomsHome, stream);
4010 void SequencerCUDA::updateHostPatchDataSOA() {
4011 std::vector<PatchDataSOA> host_copy(numPatchesHome);
4012 std::vector<HomePatch*>& homePatches = patchData->devData[deviceIndex].patches;
4014 for(int i = 0; i < numPatchesHome; i++) {
4015 host_copy[i] = homePatches[i]->patchDataSOA;
4017 copy_HtoD<PatchDataSOA>(host_copy.data(), d_HostPatchDataSOA, numPatchesHome);
4021 void SequencerCUDA::saveForceCUDASOA_direct(
4022 const bool doGlobal, const bool doForcesOutput, const int maxForceNumber) {
4023 CUDASequencerKernel->copyForcesToHostSOA(
4025 patchData->devData[deviceIndex].d_localPatches,
4041 cudaCheck(cudaStreamSynchronize(stream));
4044 void SequencerCUDA::copyPositionsToHost_direct() {
4045 CUDASequencerKernel->copyPositionsToHostSOA(
4047 patchData->devData[deviceIndex].d_localPatches,
4054 cudaCheck(cudaStreamSynchronize(stream));
4057 void SequencerCUDA::redistributeTip4pForces(
4059 const int maxForceNumber,
4060 const int doVirial) {
4061 CUDASequencerKernel->redistributeTip4pForces(
4062 d_f_normal_x, d_f_normal_y, d_f_normal_z,
4063 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
4064 d_f_slow_x, d_f_slow_y, d_f_slow_z,
4065 d_lpVirialNormal, d_lpVirialNbond, d_lpVirialSlow,
4066 d_pos_x, d_pos_y, d_pos_z, d_mass,
4067 numAtomsHome, doVirial, maxForceNumber, stream
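// TIP4P carries its charge on a massless virtual site, so the force accumulated on
// that site must be redistributed onto the oxygen and hydrogens (with matching virial
// corrections) before the integrator uses the forces; this mirrors the lone-pair
// redistribution done by lonepairsKernel->redistributeForce above.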
4071 void SequencerCUDA::allocateGPUSavedForces() {
4072 allocate_device<double>(&d_f_saved_nbond_x, numAtomsHomeAndProxyAllocated);
4073 allocate_device<double>(&d_f_saved_nbond_y, numAtomsHomeAndProxyAllocated);
4074 allocate_device<double>(&d_f_saved_nbond_z, numAtomsHomeAndProxyAllocated);
4075 allocate_device<double>(&d_f_saved_slow_x, numAtomsHomeAndProxyAllocated);
4076 allocate_device<double>(&d_f_saved_slow_y, numAtomsHomeAndProxyAllocated);
4077 allocate_device<double>(&d_f_saved_slow_z, numAtomsHomeAndProxyAllocated);