#include "CudaGlobalMasterServerKernel.h"

#if defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)

#define ATOM_BLOCKS 128
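
// Copy the requested per-atom quantities (positions, masses, charges,
// periodic-image transforms, velocities) from the SOA device arrays into the
// per-client buffers. One thread handles one entry of the copy list; the
// template parameters select at compile time which quantities are copied.
// Positions are passed through lat.reverse_transform before being copied.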
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernel(
    const double *__restrict x, const double *__restrict y,
    const double *__restrict z, const double *__restrict v_x,
    const double *__restrict v_y, const double *__restrict v_z,
    const char3 *__restrict d_transform, const float *__restrict d_mass,
    const float *__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      // Apply the reverse lattice transform to the wrapped position
      const double3 pos = {x[srcIndex], y[srcIndex], z[srcIndex]};
      const char3 t = d_transform[srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] = d_charge[srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] = v_y[srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] = v_z[srcIndex];
    }
  }
}
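
// Host-side launcher for copyAtomsToClientsKernel (single-GPU path). The five
// boolean flags are packed into a 5-bit selector that dispatches to the
// matching template instantiation.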
void copyAtomsToClientsCUDA(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double *d_pos_x, const double *d_pos_y,
    const double *d_pos_z, const double *d_vel_x, const double *d_vel_y,
    const double *d_vel_z, const char3 *d_transform, const float *d_mass,
    const float *d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernel<bool(POSITION), bool(MASS), bool(CHARGE),           \
                           bool(TRANSFORM), bool(VELOCITY)>                    \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_pos_x, d_pos_y, d_pos_z, d_vel_x, d_vel_y, d_vel_z, d_transform,   \
          d_mass, d_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  // 5-bit selector: positions | masses | charges | transforms | velocities
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0:
    break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
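
// Multi-GPU variant of copyAtomsToClientsKernel: the SOA arrays are arrays of
// peer-device pointers, indexed first by the source device recorded in the
// copy list and then by the atom's SOA index on that device.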
template <bool copyPositions, bool copyMasses, bool copyCharges,
          bool copyTransforms, bool copyVelocities>
__global__ void copyAtomsToClientsKernelMGPU(
    const double **__restrict x, const double **__restrict y,
    const double **__restrict z, const double **__restrict v_x,
    const double **__restrict v_y, const double **__restrict v_z,
    const char3 **__restrict d_transform, const float **__restrict d_mass,
    const float **__restrict d_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    if (copyPositions) {
      const double3 pos = {x[srcDevIdx][srcIndex], y[srcDevIdx][srcIndex],
                           z[srcDevIdx][srcIndex]};
      const char3 t = d_transform[srcDevIdx][srcIndex];
      const double3 pos_trans = lat.reverse_transform(pos, t);
      d_clientBuffers[dstClient].d_data[dstIndex] = pos_trans.x;
      d_clientBuffers[dstClient].d_data[dstIndex + stride] = pos_trans.y;
      d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = pos_trans.z;
    }
    if (copyMasses) {
      d_clientBuffers[dstClient].d_mass[dstIndex] = d_mass[srcDevIdx][srcIndex];
    }
    if (copyCharges) {
      d_clientBuffers[dstClient].d_charge[dstIndex] =
          d_charge[srcDevIdx][srcIndex];
    }
    if (copyTransforms) {
      d_clientBuffers[dstClient].d_transform[dstIndex] =
          d_transform[srcDevIdx][srcIndex].x;
      d_clientBuffers[dstClient].d_transform[dstIndex + stride] =
          d_transform[srcDevIdx][srcIndex].y;
      d_clientBuffers[dstClient].d_transform[dstIndex + 2 * stride] =
          d_transform[srcDevIdx][srcIndex].z;
    }
    if (copyVelocities) {
      d_clientBuffers[dstClient].d_vel[dstIndex] = v_x[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + stride] =
          v_y[srcDevIdx][srcIndex];
      d_clientBuffers[dstClient].d_vel[dstIndex + 2 * stride] =
          v_z[srcDevIdx][srcIndex];
    }
  }
}
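
// Host-side launcher for copyAtomsToClientsKernelMGPU; dispatch mirrors the
// single-GPU launcher above.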
void copyAtomsToClientsCUDAMGPU(
    bool copyPositions, bool copyMasses, bool copyCharges, bool copyTransforms,
    bool copyVelocities, const double **d_peer_pos_x,
    const double **d_peer_pos_y, const double **d_peer_pos_z,
    const double **d_peer_vel_x, const double **d_peer_vel_y,
    const double **d_peer_vel_z, const char3 **d_peer_transform,
    const float **d_peer_mass, const float **d_peer_charge, const Lattice lat,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(POSITION, MASS, CHARGE, TRANSFORM, VELOCITY)                      \
  copyAtomsToClientsKernelMGPU<bool(POSITION), bool(MASS), bool(CHARGE),       \
                               bool(TRANSFORM), bool(VELOCITY)>                \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_vel_x,              \
          d_peer_vel_y, d_peer_vel_z, d_peer_transform, d_peer_mass,           \
          d_peer_charge, lat, d_copyList, numCopyTuples, d_clientBuffers);
  // 5-bit selector: positions | masses | charges | transforms | velocities
  const int options = (int(copyPositions) << 4) + (int(copyMasses) << 3) +
                      (int(copyCharges) << 2) + (int(copyTransforms) << 1) +
                      (int(copyVelocities));
  switch (options) {
  case 0:
    break; // Nothing is copied
  case 1: CALL(0, 0, 0, 0, 1); break;
  case 2: CALL(0, 0, 0, 1, 0); break;
  case 3: CALL(0, 0, 0, 1, 1); break;
  case 4: CALL(0, 0, 1, 0, 0); break;
  case 5: CALL(0, 0, 1, 0, 1); break;
  case 6: CALL(0, 0, 1, 1, 0); break;
  case 7: CALL(0, 0, 1, 1, 1); break;
  case 8: CALL(0, 1, 0, 0, 0); break;
  case 9: CALL(0, 1, 0, 0, 1); break;
  case 10: CALL(0, 1, 0, 1, 0); break;
  case 11: CALL(0, 1, 0, 1, 1); break;
  case 12: CALL(0, 1, 1, 0, 0); break;
  case 13: CALL(0, 1, 1, 0, 1); break;
  case 14: CALL(0, 1, 1, 1, 0); break;
  case 15: CALL(0, 1, 1, 1, 1); break;
  case 16: CALL(1, 0, 0, 0, 0); break;
  case 17: CALL(1, 0, 0, 0, 1); break;
  case 18: CALL(1, 0, 0, 1, 0); break;
  case 19: CALL(1, 0, 0, 1, 1); break;
  case 20: CALL(1, 0, 1, 0, 0); break;
  case 21: CALL(1, 0, 1, 0, 1); break;
  case 22: CALL(1, 0, 1, 1, 0); break;
  case 23: CALL(1, 0, 1, 1, 1); break;
  case 24: CALL(1, 1, 0, 0, 0); break;
  case 25: CALL(1, 1, 0, 0, 1); break;
  case 26: CALL(1, 1, 0, 1, 0); break;
  case 27: CALL(1, 1, 0, 1, 1); break;
  case 28: CALL(1, 1, 1, 0, 0); break;
  case 29: CALL(1, 1, 1, 0, 1); break;
  case 30: CALL(1, 1, 1, 1, 0); break;
  case 31: CALL(1, 1, 1, 1, 1); break;
  }
#undef CALL
}
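
// Sum the normal, nonbonded, and slow force contributions for each requested
// atom and write the total force into the client's buffer. When FIXEDATOM is
// enabled, fixed atoms report a zero total force.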
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernel(
    const double *__restrict d_f_normal_x,
    const double *__restrict d_f_normal_y,
    const double *__restrict d_f_normal_z, const double *__restrict d_f_nbond_x,
    const double *__restrict d_f_nbond_y, const double *__restrict d_f_nbond_z,
    const double *__restrict d_f_slow_x, const double *__restrict d_f_slow_y,
    const double *__restrict d_f_slow_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcIndex]) {
        // Fixed atoms report zero total force
        fx = fy = fz = 0;
      } else {
        fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] +
             d_f_slow_x[srcIndex];
        fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] +
             d_f_slow_y[srcIndex];
        fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] +
             d_f_slow_z[srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcIndex] + d_f_nbond_x[srcIndex] + d_f_slow_x[srcIndex];
      fy = d_f_normal_y[srcIndex] + d_f_nbond_y[srcIndex] + d_f_slow_y[srcIndex];
      fz = d_f_normal_z[srcIndex] + d_f_nbond_z[srcIndex] + d_f_slow_z[srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
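
// Host-side launcher for copyForcesToClientsKernel; fixedOn selects the
// FIXEDATOM template instantiation.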
void copyTotalForcesToClientsCUDA(
    bool fixedOn, const double *d_f_normal_x, const double *d_f_normal_y,
    const double *d_f_normal_z, const double *d_f_nbond_x,
    const double *d_f_nbond_y, const double *d_f_nbond_z,
    const double *d_f_slow_x, const double *d_f_slow_y,
    const double *d_f_slow_z, const int *d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_f_normal_x, d_f_normal_y, d_f_normal_z, d_f_nbond_x, d_f_nbond_y,
        d_f_nbond_z, d_f_slow_x, d_f_slow_y, d_f_slow_z, d_atomFixed,
        d_copyList, numCopyTuples, d_clientBuffers);
  }
}
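
// Multi-GPU variant of copyForcesToClientsKernel, reading the force arrays
// through peer-device pointers selected by the source device in the copy list.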
template <bool FIXEDATOM>
__global__ void copyForcesToClientsKernelMGPU(
    const double **__restrict d_f_normal_x,
    const double **__restrict d_f_normal_y,
    const double **__restrict d_f_normal_z,
    const double **__restrict d_f_nbond_x,
    const double **__restrict d_f_nbond_y,
    const double **__restrict d_f_nbond_z, const double **__restrict d_f_slow_x,
    const double **__restrict d_f_slow_y, const double **__restrict d_f_slow_z,
    const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int srcDevIdx = d_copyList[i].m_src_dev_index;
    const int srcIndex = d_copyList[i].m_soa_index;
    const int dstClient = d_copyList[i].m_client_index;
    const int dstIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[dstClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[srcDevIdx][srcIndex]) {
        // Fixed atoms report zero total force
        fx = fy = fz = 0;
      } else {
        fx = d_f_normal_x[srcDevIdx][srcIndex] +
             d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
        fy = d_f_normal_y[srcDevIdx][srcIndex] +
             d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
        fz = d_f_normal_z[srcDevIdx][srcIndex] +
             d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
      }
    } else {
      fx = d_f_normal_x[srcDevIdx][srcIndex] +
           d_f_nbond_x[srcDevIdx][srcIndex] + d_f_slow_x[srcDevIdx][srcIndex];
      fy = d_f_normal_y[srcDevIdx][srcIndex] +
           d_f_nbond_y[srcDevIdx][srcIndex] + d_f_slow_y[srcDevIdx][srcIndex];
      fz = d_f_normal_z[srcDevIdx][srcIndex] +
           d_f_nbond_z[srcDevIdx][srcIndex] + d_f_slow_z[srcDevIdx][srcIndex];
    }
    d_clientBuffers[dstClient].d_data[dstIndex] = fx;
    d_clientBuffers[dstClient].d_data[dstIndex + stride] = fy;
    d_clientBuffers[dstClient].d_data[dstIndex + 2 * stride] = fz;
  }
}
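
// Host-side launcher for copyForcesToClientsKernelMGPU; fixedOn selects the
// FIXEDATOM template instantiation.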
void copyTotalForcesToClientsCUDAMGPU(
    bool fixedOn, const double **d_peer_f_normal_x,
    const double **d_peer_f_normal_y, const double **d_peer_f_normal_z,
    const double **d_peer_f_nbond_x, const double **d_peer_f_nbond_y,
    const double **d_peer_f_nbond_z, const double **d_peer_f_slow_x,
    const double **d_peer_f_slow_y, const double **d_peer_f_slow_z,
    const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  if (fixedOn) {
    copyForcesToClientsKernelMGPU<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  } else {
    copyForcesToClientsKernelMGPU<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
        d_peer_f_normal_x, d_peer_f_normal_y, d_peer_f_normal_z,
        d_peer_f_nbond_x, d_peer_f_nbond_y, d_peer_f_nbond_z, d_peer_f_slow_x,
        d_peer_f_slow_y, d_peer_f_slow_z, d_peer_atomFixed, d_copyList,
        numCopyTuples, d_clientBuffers);
  }
}
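
// Scatter the forces supplied by the clients back into the global force
// arrays. UNIQUE_ATOMS is expected to mean that no atom appears more than once
// in the copy list, so plain additions are safe; otherwise atomicAdd is used.
// With FIXEDATOM enabled, client forces on fixed atoms are discarded.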
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernel(
    double *__restrict d_f_global_x, double *__restrict d_f_global_y,
    double *__restrict d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstIndex]) {
        // Do not apply client forces to fixed atoms
        fx = fy = fz = 0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstIndex] += fx;
      d_f_global_y[dstIndex] += fy;
      d_f_global_z[dstIndex] += fz;
    } else {
      // The same atom may be requested by multiple clients
      atomicAdd(&(d_f_global_x[dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstIndex]), fz);
    }
  }
}
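
// Host-side launcher for addGlobalForcesFromClientsKernel; fixedOn and
// uniqueAtoms are packed into a 2-bit selector for the template dispatch.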
void addGlobalForcesFromClients(
    const int fixedOn, const int uniqueAtoms,
    double *d_f_global_x, double *d_f_global_y,
    double *d_f_global_z, const int *__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                          \
  addGlobalForcesFromClientsKernel<FIXEDATOM, UNIQUE_ATOMS>                    \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_f_global_x, d_f_global_y, d_f_global_z, d_atomFixed, d_copyList,   \
          numCopyTuples, d_clientBuffers)
  switch (options) {
  case ((0<<1) + 0): CALL(0, 0); break;
  case ((0<<1) + 1): CALL(0, 1); break;
  case ((1<<1) + 0): CALL(1, 0); break;
  case ((1<<1) + 1): CALL(1, 1); break;
  default: {
    const std::string error = "Error in addGlobalForcesFromClients. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}
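
// Multi-GPU variant of addGlobalForcesFromClientsKernel: forces are written
// through peer-device pointers to the device that owns the destination atom.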
template <bool FIXEDATOM, bool UNIQUE_ATOMS>
__global__ void addGlobalForcesFromClientsKernelMGPU(
    double **__restrict d_f_global_x, double **__restrict d_f_global_y,
    double **__restrict d_f_global_z, const int **__restrict d_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *__restrict d_copyList,
    size_t numCopyTuples,
    CudaGlobalMasterServer::ClientBuffer *__restrict d_clientBuffers) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numCopyTuples) {
    const int dstDev = d_copyList[i].m_src_dev_index;
    const int dstIndex = d_copyList[i].m_soa_index;
    const int srcClient = d_copyList[i].m_client_index;
    const int srcIndex = d_copyList[i].m_client_atom_pos;
    const size_t stride = d_clientBuffers[srcClient].sz;
    double fx, fy, fz;
    if (FIXEDATOM) {
      if (d_atomFixed[dstDev][dstIndex]) {
        // Do not apply client forces to fixed atoms
        fx = fy = fz = 0;
      } else {
        fx = d_clientBuffers[srcClient].d_data[srcIndex];
        fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
        fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
      }
    } else {
      fx = d_clientBuffers[srcClient].d_data[srcIndex];
      fy = d_clientBuffers[srcClient].d_data[srcIndex + stride];
      fz = d_clientBuffers[srcClient].d_data[srcIndex + 2 * stride];
    }
    if (UNIQUE_ATOMS) {
      d_f_global_x[dstDev][dstIndex] += fx;
      d_f_global_y[dstDev][dstIndex] += fy;
      d_f_global_z[dstDev][dstIndex] += fz;
    } else {
      // The same atom may be requested by multiple clients
      atomicAdd(&(d_f_global_x[dstDev][dstIndex]), fx);
      atomicAdd(&(d_f_global_y[dstDev][dstIndex]), fy);
      atomicAdd(&(d_f_global_z[dstDev][dstIndex]), fz);
    }
  }
}
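
// Host-side launcher for addGlobalForcesFromClientsKernelMGPU; dispatch
// mirrors the single-GPU launcher.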
void addGlobalForcesFromClientsMGPU(
    const int fixedOn, const int uniqueAtoms,
    double **d_peer_f_global_x, double **d_peer_f_global_y,
    double **d_peer_f_global_z, const int **d_peer_atomFixed,
    const CudaGlobalMasterServer::CopyListTuple *d_copyList,
    size_t numCopyTuples, CudaGlobalMasterServer::ClientBuffer *d_clientBuffers,
    size_t numClients, cudaStream_t stream) {
  const int grid = (numCopyTuples + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  const int options = (fixedOn << 1) + uniqueAtoms;
#define CALL(FIXEDATOM, UNIQUE_ATOMS)                                          \
  addGlobalForcesFromClientsKernelMGPU<FIXEDATOM, UNIQUE_ATOMS>                \
      <<<grid, ATOM_BLOCKS, 0, stream>>>(                                      \
          d_peer_f_global_x, d_peer_f_global_y, d_peer_f_global_z,             \
          d_peer_atomFixed, d_copyList, numCopyTuples, d_clientBuffers);
  switch (options) {
  case ((0<<1) + 0): CALL(0, 0); break;
  case ((0<<1) + 1): CALL(0, 1); break;
  case ((1<<1) + 0): CALL(1, 0); break;
  case ((1<<1) + 1): CALL(1, 1); break;
  default: {
    const std::string error = "Error in addGlobalForcesFromClientsMGPU. No kernel is called.\n";
    NAMD_bug(error.c_str());
  }
  }
#undef CALL
}

#endif // defined(NAMD_CUDA) && defined(NODEGROUP_FORCE_REGISTER)