#include "CudaGlobalMasterServerKernel.h"

#if (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)

#define ATOM_BLOCKS 128
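
// Copies the selected per-atom fields (positions, masses, charges, lattice
// transforms, velocities) from the simulation's SOA device arrays into each
// client's buffer. The template flags choose the fields at compile time so
// unused branches are compiled away; the copy list maps each tuple to its
// source SOA index and destination client slot.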
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernel(
    const double *__restrict x, const double *__restrict y,
    const double *__restrict z, const double *__restrict v_x,
    const double *__restrict v_y, const double *__restrict v_z,
    const char3 *__restrict d_transform, const float *__restrict d_mass,
    const float *__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcIndex], y[srcIndex], z[srcIndex]};
      const char3 t = d_transform[srcIndex];
      // Undo the periodic wrapping so clients see consistent coordinates.
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] = d_charge[srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] = v_y[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] = v_z[srcIndex];
    }
  }
}
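
// Host-side launcher: packs the runtime copy flags into a 5-bit selector and
// dispatches to the matching template instantiation of
// copyAtomsToClientsKernel on the given stream.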
void copyAtomsToClientsCUDA(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double *d_pos_x, const double *d_pos_y,
    const double *d_pos_z, const double *d_vel_x, const double *d_vel_y,
    const double *d_vel_z, const char3 *d_transform, const float *d_mass,
    const float *d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernel<bool(POSITION), bool(MASS), bool(CHARGE),           \
                           bool(TRANSFORM), bool(VELOCITY)>                    \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z, d_transform,   \
          d_mass, d_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  // Pack the five copy flags into a 5-bit selector:
  // bit 4 = positions, bit 3 = masses, bit 2 = charges,
  // bit 1 = transforms, bit 0 = velocities.
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
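
// Multi-GPU variant of copyAtomsToClientsKernel: the SOA arrays are passed as
// per-device pointer tables and each copy-list entry also records the source
// device index.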
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernelMGPU(
    const double **__restrict x, const double **__restrict y,
    const double **__restrict z, const double **__restrict v_x,
    const double **__restrict v_y, const double **__restrict v_z,
    const char3 **__restrict d_transform, const float **__restrict d_mass,
    const float **__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcDevIdx][srcIndex], y[srcDevIdx][srcIndex],
                           z[srcDevIdx][srcIndex]};
      const char3 t = d_transform[srcDevIdx][srcIndex];
      // Undo the periodic wrapping so clients see consistent coordinates.
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] =
          d_mass[srcDevIdx][srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] =
          d_charge[srcDevIdx][srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcDevIdx][srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcDevIdx][srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcDevIdx][srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] =
          v_y[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] =
          v_z[srcDevIdx][srcIndex];
    }
  }
}
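
// Host-side launcher for the multi-GPU path; the dispatch mirrors
// copyAtomsToClientsCUDA but the kernel reads through peer-device pointer
// tables.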
void copyAtomsToClientsCUDAMGPU(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double **d_peer_pos_x,
    const double **d_peer_pos_y, const double **d_peer_pos_z,
    const double **d_peer_vel_x, const double **d_peer_vel_y,
    const double **d_peer_vel_z, const char3 **d_peer_transform,
    const float **d_peer_mass, const float **d_peer_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernelMGPU<bool(POSITION), bool(MASS), bool(CHARGE),       \
                               bool(TRANSFORM), bool(VELOCITY)>                \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_vel_x,              \
          d_peer_vel_y, d_peer_vel_z, d_peer_transform, d_peer_mass,           \
          d_peer_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  // Same 5-bit selector as in copyAtomsToClientsCUDA:
  // bit 4 = positions, bit 3 = masses, bit 2 = charges,
  // bit 1 = transforms, bit 0 = velocities.
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0: break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
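
// Accumulates the total force on each requested atom (normal + nonbonded +
// slow components) into the client's buffer. With FIXEDATOM enabled, atoms
// flagged in d_atomFixed report a zero total force.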
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernel(
    const double *__restrict d_f_normal_x,
    const double *__restrict d_f_normal_y,
    const double *__restrict d_f_normal_z, const double *__restrict d_f_nbond_x,
    const double *__restrict d_f_nbond_y, const double *__restrict d_f_nbond_z,
    const double *__restrict d_f_slow_x, const double *__restrict d_f_slow_y,
    const double *__restrict d_f_slow_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcIndex]) {
        // Fixed atoms contribute no total force.
        fx = fy = fz = 0;
      } else {
        fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] +
             d_f_slow_x[srcIndex];
        fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] +
             d_f_slow_y[srcIndex];
        fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] +
             d_f_slow_z[srcIndex];
      }
    } else {
      fx =
          d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] + d_f_slow_x[srcIndex];
      fy =
          d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] + d_f_slow_y[srcIndex];
      fz =
          d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] + d_f_slow_z[srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
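
// Host-side launcher: picks the FIXEDATOM template variant according to
// fixedOn and launches the kernel on the given stream.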
void copyTotalForcesToClientsCUDA(
    bool fixedOn, const double *d_f_normal_x, const double *d_f_normal_y,
    const double *d_f_normal_z, const double *d_f_nbond_x,
    const double *d_f_nbond_y, const double *d_f_nbond_z,
    const double *d_f_slow_x, const double *d_f_slow_y,
    const double *d_f_slow_z, const int *d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  }
}
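
// Multi-GPU variant: the force components and the fixed-atom flags are read
// through per-device pointer tables indexed by the source device of each
// copy-list entry.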
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernelMGPU(
    const double **__restrict d_f_normal_x,
    const double **__restrict d_f_normal_y,
    const double **__restrict d_f_normal_z,
    const double **__restrict d_f_nbond_x,
    const double **__restrict d_f_nbond_y,
    const double **__restrict d_f_nbond_z, const double **__restrict d_f_slow_x,
    const double **__restrict d_f_slow_y, const double **__restrict d_f_slow_z,
    const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcDevIdx][srcIndex]) {
        // Fixed atoms contribute no total force.
        fx = fy = fz = 0;
      } else {
        fx = d_f_normal_x[srcDevIdx][srcIndex] +
             d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
        fy = d_f_normal_y[srcDevIdx][srcIndex] +
             d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
        fz = d_f_normal_z[srcDevIdx][srcIndex] +
             d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcDevIdx][srcIndex] +
           d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
      fy = d_f_normal_y[srcDevIdx][srcIndex] +
           d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
      fz = d_f_normal_z[srcDevIdx][srcIndex] +
           d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
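
// Host-side launcher for the multi-GPU total-force copy; dispatch mirrors
// copyTotalForcesToClientsCUDA.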
void copyTotalForcesToClientsCUDAMGPU(
    bool fixedOn, const double **d_peer_f_normal_x,
    const double **d_peer_f_normal_y, const double **d_peer_f_normal_z,
    const double **d_peer_f_nbond_x, const double **d_peer_f_nbond_y,
    const double **d_peer_f_nbond_z, const double **d_peer_f_slow_x,
    const double **d_peer_f_slow_y, const double **d_peer_f_slow_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernelMGPU<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernelMGPU<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  }
}
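
// Scatters the forces supplied by each client back into the global force
// arrays. With UNIQUE_ATOMS, every destination atom appears at most once in
// the copy list and a plain add suffices; otherwise atomicAdd is used because
// several clients may apply forces to the same atom.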
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernel(
    double *__restrict d_f_global_x, double *__restrict d_f_global_y,
    double *__restrict d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstIndex]) {
        // No external force is applied to fixed atoms.
        fx = fy = fz = 0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstIndex] += fx;
      d_f_global_y[dstIndex] += fy;
      d_f_global_z[dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstIndex]), fz);
    }
  }
}
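
// Host-side launcher: encodes (fixedOn, uniqueAtoms) as a 2-bit selector and
// dispatches to the matching template instantiation.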
void addGlobalForcesFromClients(
    const int fixedOn, const int uniqueAtoms,
    double *d_f_global_x, double *d_f_global_y,
    double *d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                          \
  addGlobalForcesFromClientsKernel<FIXEDATOM, UNIQUE_ATOMS>                    \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_f_global_x, d_f_global_y, d_f_global_z, d_atomFixed, d_copyList,   \
          numCopyTuples, d_clientBuffers)
  switch (options) {
  case ((0 << 1) + 0): CALL(0, 0); break;
  case ((0 << 1) + 1): CALL(0, 1); break;
  case ((1 << 1) + 0): CALL(1, 0); break;
  case ((1 << 1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClients. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}
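
// Multi-GPU variant: the global force arrays and fixed-atom flags are
// addressed through per-device pointer tables using the destination device
// recorded in the copy list.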
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernelMGPU(
    double **__restrict d_f_global_x, double **__restrict d_f_global_y,
    double **__restrict d_f_global_z, const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstDev = d_copyList[i].m_src_dev_index;
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      // d_atomFixed is a per-device table, so index by device first.
      if (d_atomFixed[dstDev][dstIndex]) {
        // No external force is applied to fixed atoms.
        fx = fy = fz = 0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstDev][dstIndex] += fx;
      d_f_global_y[dstDev][dstIndex] += fy;
      d_f_global_z[dstDev][dstIndex] += fz;
    } else {
      atomicAdd(&(d_f_global_x[dstDev][dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstDev][dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstDev][dstIndex]), fz);
    }
  }
}
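
// Host-side launcher for the multi-GPU path; dispatch mirrors
// addGlobalForcesFromClients.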
void addGlobalForcesFromClientsMGPU(
    const int fixedOn, const int uniqueAtoms,
    double **d_peer_f_global_x, double **d_peer_f_global_y,
    double **d_peer_f_global_z, const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  if (numCopyTuples == 0) return;
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                          \
  addGlobalForcesFromClientsKernelMGPU<FIXEDATOM, UNIQUE_ATOMS>                \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_peer_f_global_x, d_peer_f_global_y, d_peer_f_global_z,             \
          d_peer_atomFixed, d_copyList, numCopyTuples, d_clientBuffers);
  switch (options) {
  case ((0 << 1) + 0): CALL(0, 0); break;
  case ((0 << 1) + 1): CALL(0, 1); break;
  case ((1 << 1) + 0): CALL(1, 0); break;
  case ((1 << 1) + 1): CALL(1, 1); break;
  default: {
    const std::string error =
        "Error in addGlobalForcesFromClientsMGPU. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

#endif // (defined(NAMD_CUDA) || defined(NAMD_HIP)) && defined(NODEGROUP_FORCE_REGISTER)