3 * Tuple migration kernels and wrapper functions
7 #if __CUDACC_VER_MAJOR__ >= 11
10 #include <namd_cub/cub.cuh>
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
16 #endif // end NAMD_CUDA vs. NAMD_HIP
18 #include "HipDefines.h"
20 #include "MigrationBondedCUDAKernel.h"
21 #include "ComputeBondedCUDAKernel.h"
23 #ifdef NODEGROUP_FORCE_REGISTER
25 // Since only the downstream function from the patch map is needed,
26 // it is implemented separately here
27 __device__ __forceinline__ int patchMap_downstream(
28 const int pid1, const int pid2, const int aDim, const int bDim,
29 const int cMaxIndex, const int bMaxIndex, const int aMaxIndex
34 } else if (pid1 == -1 || pid2 == -1) { // -1 means notUsed
38 data1.x = pid1 % aDim;
39 data1.y = (pid1 / aDim) % bDim;
40 data1.z = pid1 / (aDim * bDim);
42 data2.x = pid2 % aDim;
43 data2.y = (pid2 / aDim) % bDim;
44 data2.z = pid2 / (aDim * bDim);
48 if ( ( k ? k : cMaxIndex ) == k2 + 1 ) k = k2;
52 if ( ( j ? j : bMaxIndex ) == j2 + 1 ) j = j2;
56 if ( ( i ? i : aMaxIndex ) == i2 + 1 ) i = i2;
58 ds = ((k*bDim)+j)*aDim + i;
65 * \brief Computes the destination information for each tuple
67 * This kernel computes the destination device and patch for each tuple. It also
68 * converts the indexInDevice value stored in the "index" field to the local index
69 * of the atom in each patch via the migration destination values. It computes the
70 downstream patch index on that patch's home device.
72 * The destination device and downstream patch index are stored in separate buffers
73 * because they don't need to be migrated to new devices
75 * Since the tuple data is stored as AoS, we bulk load a batch into shared memory
76 * to use coalesced access
80 __global__ void computeTupleDestinationKernel(
84 int* d_downstreamPatchIndex,
85 const int4* migrationDestination,
86 const int* patchToGPU,
87 const int* patchIDtoHomePatchIndex,
94 using AccessType = int32_t;
95 __shared__ T s_in[MigrationBondedCUDAKernel::kNumThreads];
96 AccessType* s_in_int = (AccessType*) &(s_in);
98 const int numTuplesBlock = ((numTuples + blockDim.x - 1) / blockDim.x) * blockDim.x;
100 for (int i = blockDim.x * blockIdx.x; i < numTuplesBlock; i += blockDim.x * gridDim.x) {
101 AccessType* g_in_int = (AccessType*) (stages + i);
102 const int numBytesIn = sizeof(T) * blockDim.x;
103 for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
104 const int tupleIdx = i + (j / (sizeof(T) / sizeof(AccessType)));
105 if (tupleIdx < numTuples) {
106 s_in_int[j] = g_in_int[j];
111 const int tupleIndex = i + threadIdx.x;
112 if (tupleIndex < numTuples) {
113 T elem = s_in[threadIdx.x];
114 const int tmp_pid = migrationDestination[elem.index[0]].x;
115 int downstream_patch = tmp_pid;
116 elem.patchIDs[0] = tmp_pid;
117 elem.index[0] = migrationDestination[elem.index[0]].w;
119 for (int j = 1; j < T::size; j++) {
120 const int tmp_pid = migrationDestination[elem.index[j]].x;
122 downstream_patch = patchMap_downstream(
123 downstream_patch, tmp_pid,
124 aDim, bDim, cMaxIndex, bMaxIndex, aMaxIndex
126 elem.patchIDs[j] = tmp_pid;
127 elem.index[j] = migrationDestination[elem.index[j]].w;
130 s_in[threadIdx.x] = elem;
131 d_device[tupleIndex] = patchToGPU[downstream_patch];
132 d_downstreamPatchIndex[tupleIndex] = patchIDtoHomePatchIndex[downstream_patch];
137 for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
138 const int tupleIdx = i + (j / (sizeof(T) / sizeof(AccessType)));
139 if (tupleIdx < numTuples) {
140 g_in_int[j] = s_in_int[j];
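// The bulk shared-memory staging described in the kernel's comment above is the
// key access pattern here. Below is a stand-alone illustrative sketch of that
// pattern (placeholder names; not part of the build): a batch of AoS tuples is
// copied as 32-bit words so that consecutive threads read consecutive words
// instead of issuing strided whole-struct loads.
#if 0
template <typename Tuple>
__device__ void stageTuplesToShared(
  const Tuple* __restrict__ g_tuples,  // global AoS tuple array
  Tuple* s_tuples,                     // shared-memory staging buffer (blockDim.x tuples)
  const int blockStart,                // index of the first tuple handled by this block
  const int numTuples                  // total number of tuples
) {
  using Word = int32_t;
  static_assert(sizeof(Tuple) % sizeof(Word) == 0, "Tuple size must be a multiple of the word size");
  constexpr int kWordsPerTuple = sizeof(Tuple) / sizeof(Word);
  const Word* g_words = (const Word*) (g_tuples + blockStart);
  Word* s_words = (Word*) s_tuples;
  const int numWords = kWordsPerTuple * blockDim.x;
  for (int w = threadIdx.x; w < numWords; w += blockDim.x) {
    if (blockStart + w / kWordsPerTuple < numTuples) {  // guard the ragged last batch
      s_words[w] = g_words[w];
    }
  }
  __syncthreads();
}
#endif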
148 * \brief Computes the new index of non-migrating tuples
150 * Most tuples do not move into new patches, so we will compute their new index first
151 using a scan operation. There can be a large number of tuples per patch (more than
152 the number of atoms per patch), so we do the scan operation in batches while still
153 * using the CUB block API.
156 __global__ void reserveLocalDestinationKernel(
157 const int myDeviceIndex,
158 const int numPatchesHome,
160 const int* d_downstreamPatchIndex,
161 const int* d_offsets,
165 constexpr int kNumThreads = MigrationBondedCUDAKernel::kScanThreads;
166 constexpr int kItemsPerThread = MigrationBondedCUDAKernel::kScanItemsPerThread;
167 constexpr int kItemsPerIteration = kNumThreads * kItemsPerThread;
169 typedef cub::BlockScan<int, kNumThreads> BlockScan;
170 __shared__ typename BlockScan::TempStorage temp_storage;
172 for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
173 const int offset = d_offsets[patchIndex];
174 const int numTuples = d_offsets[patchIndex+1] - offset;
176 const int numIterations = (numTuples + kItemsPerIteration - 1) / kItemsPerIteration;
178 // We will do the scan in pieces in case there are too many tuples
179 int input_prefix = 0;
180 for (int iter = 0; iter < numIterations; iter++) {
181 int thread_input[kItemsPerThread];
182 int thread_output[kItemsPerThread];
185 // Load the batch into registers
186 for (int i = 0; i < kItemsPerThread; i++) {
187 const int idx = iter * kItemsPerIteration + kItemsPerThread * threadIdx.x + i;
188 if (idx < numTuples && d_downstreamPatchIndex[offset + idx] == patchIndex
189 && d_device[offset + idx] == myDeviceIndex) {
196 // Compute the prefix sum
197 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, sum);
201 for (int i = 0; i < kItemsPerThread; i++) {
202 const int idx = iter * kItemsPerIteration + kItemsPerThread * threadIdx.x + i;
203 if (idx < numTuples && thread_input[i]) {
204 d_dstIndex[offset + idx] = thread_output[i] + input_prefix;
210 // We then need to set the count for this patch
211 if (threadIdx.x == 0) {
212 d_counts[patchIndex] = input_prefix;
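// The comment above describes scanning each patch's tuples in batches because a
// single CUB block scan pass covers at most kNumThreads * kItemsPerThread items.
// A minimal, generic sketch of that batched pattern with a running prefix
// (placeholder names; not part of the build):
#if 0
template <int kThreads, int kItemsPerThread>
__device__ void batchedExclusiveSum(const int* in, int* out, const int n) {
  typedef cub::BlockScan<int, kThreads> BlockScan;
  __shared__ typename BlockScan::TempStorage temp;
  constexpr int kItemsPerBatch = kThreads * kItemsPerThread;

  int prefix = 0;  // running sum carried over from previous batches
  for (int base = 0; base < n; base += kItemsPerBatch) {
    int items[kItemsPerThread];
    int scanned[kItemsPerThread];
    for (int i = 0; i < kItemsPerThread; i++) {
      const int idx = base + kItemsPerThread * threadIdx.x + i;
      items[i] = (idx < n) ? in[idx] : 0;  // zero-pad the ragged last batch
    }
    int batchTotal;
    BlockScan(temp).ExclusiveSum(items, scanned, batchTotal);
    for (int i = 0; i < kItemsPerThread; i++) {
      const int idx = base + kItemsPerThread * threadIdx.x + i;
      if (idx < n) out[idx] = scanned[i] + prefix;
    }
    prefix += batchTotal;
    __syncthreads();  // TempStorage is reused by the next batch
  }
}
#endif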
219 * \brief Computes the destination for all tuples and the new index for local tuples
221 This will call the following kernels for each tuple type:
222 * - computeTupleDestinationKernel() \copybrief computeTupleDestinationKernel()
223 * - reserveLocalDestinationKernel() \copybrief reserveLocalDestinationKernel()
226 void MigrationBondedCUDAKernel::computeTupleDestination(
227 const int myDeviceIndex,
229 const int numPatchesHome,
230 const int4* migrationDestination,
231 const int* patchIDtoGPU,
232 const int* patchIDtoHomePatchIndex,
240 constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
243 #define CALL(fieldName, typeName) do { \
244 numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
246 computeTupleDestinationKernel<typeName><<<numBlocks, numThreads, 0, stream>>>( \
249 d_device.fieldName(), \
250 d_downstreamPatchIndex.fieldName(), \
251 migrationDestination, \
253 patchIDtoHomePatchIndex, \
255 cMaxIndex, bMaxIndex, aMaxIndex \
260 CALL(bond, CudaBondStage);
261 CALL(angle, CudaAngleStage);
262 CALL(dihedral, CudaDihedralStage);
263 CALL(improper, CudaDihedralStage);
264 CALL(modifiedExclusion, CudaExclusionStage);
265 CALL(exclusion, CudaExclusionStage);
266 CALL(crossterm, CudaCrosstermStage);
270 #define CALL(fieldName, typeName) do { \
271 numBlocks = numPatchesHome; \
272 reserveLocalDestinationKernel<<<numBlocks, kScanThreads, 0, stream>>>( \
275 d_device.fieldName(), \
276 d_downstreamPatchIndex.fieldName(), \
277 d_offsets.fieldName(), \
278 d_dstIndex.fieldName(), \
279 d_counts.fieldName() \
283 CALL(bond, CudaBondStage);
284 CALL(angle, CudaAngleStage);
285 CALL(dihedral, CudaDihedralStage);
286 CALL(improper, CudaDihedralStage);
287 CALL(modifiedExclusion, CudaExclusionStage);
288 CALL(exclusion, CudaExclusionStage);
289 CALL(crossterm, CudaCrosstermStage);
295 * \brief Computes the new index of migrating tuples
297 Uses atomic operations to compute the new index for tuples which have migrated
301 __global__ void reserveTupleDestinationKernel(
302 const int myDeviceIndex,
303 const int numPatchesHome,
305 const int* d_downstreamPatchIndex,
306 const int* d_offsets,
310 for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
311 const int offset = d_offsets[patchIndex];
312 const int numTuples = d_offsets[patchIndex+1] - offset;
314 for (int i = threadIdx.x; i < numTuples; i += blockDim.x) {
315 const int downstreamPatchIndex = d_downstreamPatchIndex[offset + i];
316 const int device = d_device[offset + i];
317 if (downstreamPatchIndex != patchIndex || device != myDeviceIndex) {
319 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
320 const int dstIndex = atomicAdd_system(&(peer_counts[device][downstreamPatchIndex]), 1);
322 // support single-GPU Maxwell
323 const int dstIndex = atomicAdd(&(peer_counts[device][downstreamPatchIndex]), 1);
325 d_dstIndex[offset + i] = dstIndex;
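// The index reservation above hinges on atomicAdd returning the counter's old
// value: each migrating tuple gets a unique, densely packed slot in its
// destination patch. A minimal sketch of the idea (placeholder name; not part
// of the build):
#if 0
__device__ int reserveSlot(int* counter) {
  // atomicAdd_system is required when `counter` may live in another GPU's
  // memory (peer access); plain atomicAdd is sufficient on a single GPU.
  return atomicAdd_system(counter, 1);  // old value == this tuple's slot index
}
#endif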
333 * \brief Computes the index of migrating tuples with atomics
335 This will call the following kernels for each tuple type:
336 * - reserveTupleDestinationKernel() \copybrief reserveTupleDestinationKernel()
339 void MigrationBondedCUDAKernel::reserveTupleDestination(
340 const int myDeviceIndex,
341 const int numPatchesHome,
344 constexpr int numThreads = MigrationBondedCUDAKernel::kPatchThreads;
347 #define CALL(fieldName, typeName) do { \
348 numBlocks = numPatchesHome; \
349 reserveTupleDestinationKernel<<<numBlocks, numThreads, 0, stream>>>( \
352 d_device.fieldName(), \
353 d_downstreamPatchIndex.fieldName(), \
354 d_offsets.fieldName(), \
355 d_dstIndex.fieldName(), \
356 d_peer_counts.fieldName \
360 CALL(bond, CudaBondStage);
361 CALL(angle, CudaAngleStage);
362 CALL(dihedral, CudaDihedralStage);
363 CALL(improper, CudaDihedralStage);
364 CALL(modifiedExclusion, CudaExclusionStage);
365 CALL(exclusion, CudaExclusionStage);
366 CALL(crossterm, CudaCrosstermStage);
372 * \brief Gathers the per-device tuple counts so they can be transferred together
375 __global__ void updateTotalCount(
376 const int numPatchesHome,
377 TupleIntArraysContiguous d_offsets,
378 TupleCounts* d_totalCounts
380 d_totalCounts[0].bond = d_offsets.bond()[numPatchesHome];
381 d_totalCounts[0].angle = d_offsets.angle()[numPatchesHome];
382 d_totalCounts[0].dihedral = d_offsets.dihedral()[numPatchesHome];
383 d_totalCounts[0].improper = d_offsets.improper()[numPatchesHome];
384 d_totalCounts[0].modifiedExclusion = d_offsets.modifiedExclusion()[numPatchesHome];
385 d_totalCounts[0].exclusion = d_offsets.exclusion()[numPatchesHome];
389 * \brief Computes the patch-level offsets of the tuples
391 * This uses the block API because for most systems the number of patches does not
392 * motivate the use of the device API.
395 __global__ void threadBlockPatchScan(
396 const int numPatchesHome,
397 TupleIntArraysContiguous d_counts,
398 TupleIntArraysContiguous d_offsets,
399 TupleCounts* d_totalCounts
401 constexpr int kItemsPerThread = MigrationBondedCUDAKernel::kPatchItemsPerThread;
403 typedef cub::BlockScan<int, MigrationBondedCUDAKernel::kPatchThreads> BlockScan;
404 __shared__ typename BlockScan::TempStorage temp_storage;
406 int* input = d_counts.data + d_counts.offsets[blockIdx.x];
407 int* output = d_offsets.data + d_offsets.offsets[blockIdx.x];
409 int r_input[kItemsPerThread];
410 int r_output[kItemsPerThread];
412 for (int i = 0; i < kItemsPerThread; i++) {
413 const int idx = kItemsPerThread * threadIdx.x + i;
414 if (idx < numPatchesHome) {
415 r_input[i] = input[idx];
422 // Compute the inclusive prefix sum of the per-patch tuple counts
424 BlockScan(temp_storage).InclusiveSum(r_input, r_output, total);
427 for (int i = 0; i < kItemsPerThread; i++) {
428 const int idx = kItemsPerThread * threadIdx.x + i + 1;
429 if (idx < numPatchesHome+1) {
430 output[idx] = r_output[i];
434 if (threadIdx.x == 0) {
435 int* int_totalCounts = (int*) d_totalCounts;
436 int_totalCounts[blockIdx.x] = total;
441 * \brief Computes the patch-level offsets of the tuples
443 Depending on the number of home patches, we will either use the CUB device API
444 or the CUB block API. The block API has less overhead since it is done in a single
445 kernel, whereas the device API launches 2 kernels per tuple type.
447 * If the single kernel path is taken, it will call the following kernel:
448 * - threadBlockPatchScan() \copybrief threadBlockPatchScan()
451 void MigrationBondedCUDAKernel::computePatchOffsets(
452 const int numPatchesHome,
455 if (numPatchesHome > kMaxPatchesForSingleThreadBlock) {
456 #define CALL(fieldName) do { \
457 cub::DeviceScan::InclusiveSum( \
458 d_patchDeviceScan_scratch, temp, \
459 d_counts.fieldName(), d_offsets.fieldName()+1, numPatchesHome, stream \
463 size_t temp = patchDeviceScan_alloc;
468 CALL(modifiedExclusion);
474 updateTotalCount<<<1, 1, 0, stream>>>(numPatchesHome, d_offsets, d_totalCount);
476 constexpr int numThreads = kPatchThreads;
477 const int numBlocks = kNumTuplesTypes;
479 threadBlockPatchScan<<<numBlocks, numThreads, 0, stream>>>(
489 * \brief Performs actual tuple migration
491 Based on the previously computed tuple devices, patch indices, patch offsets, and
492 tuple indices, this kernel performs the actual tuple migration. The tuples are stored as AoS, so
493 we try to keep memory accesses as coalesced as possible. This does leave some threads idle.
496 * This copies the tuples into a scratch buffer
500 __global__ void performTupleMigrationKernel(
502 const int* const* peer_offsets,
506 int* d_downstreamPatchIndex,
509 using AccessType = int32_t;
510 static_assert(sizeof(T) % sizeof(AccessType) == 0, "Tuple size must be divisible by sizeof(AccessType)");
511 constexpr int kThreadsPerTuple = sizeof(T) / sizeof(AccessType);
512 constexpr int kTuplesPerWarp = WARPSIZE / kThreadsPerTuple;
514 const int numTuplesWarp = ((numTuples + kTuplesPerWarp - 1) / kTuplesPerWarp);
515 const int numWarps = (blockDim.x * gridDim.x) / WARPSIZE;
516 const int wid_grid = (threadIdx.x + blockDim.x * blockIdx.x) / WARPSIZE;
517 const int lid = threadIdx.x % WARPSIZE;
518 const int indexInTuple = lid % kThreadsPerTuple;
519 const int tupleWarpIndex = lid / kThreadsPerTuple;
521 for (int i = wid_grid; i < numTuplesWarp; i += numWarps) {
522 const int tupleIndex = i * kTuplesPerWarp + tupleWarpIndex;
523 const int active = (lid < kThreadsPerTuple * kTuplesPerWarp);
525 if (active && tupleIndex < numTuples) {
526 const int device = d_device[tupleIndex];
527 const int index = d_dstIndex[tupleIndex];
528 const int patchIndex = d_downstreamPatchIndex[tupleIndex];
530 // Hopefully this is cached to save us from lots of P2P accesses
531 const int offset = peer_offsets[device][patchIndex];
533 AccessType* g_src_int = (AccessType*) &(src[tupleIndex]);
534 AccessType* g_dst_int = (AccessType*) &(dst[device][offset + index]);
535 g_dst_int[indexInTuple] = g_src_int[indexInTuple];
537 #if defined(NAMD_HIP)
538 NAMD_WARP_SYNC(WARP_FULL_MASK);
540 WARP_SYNC(WARP_FULL_MASK);
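// Worked example of the warp partitioning above (hypothetical tuple size, not
// taken from the actual structures): a 24-byte tuple gives kThreadsPerTuple =
// 24 / 4 = 6 and, with a 32-lane warp, kTuplesPerWarp = 32 / 6 = 5, so 30 lanes
// copy tuple words and the remaining 2 lanes sit idle, as noted in the kernel
// comment.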
547 * \brief Moves tuples to their updated patches' scratch buffers
549 This will call the following kernels for each tuple type:
550 * - performTupleMigrationKernel() \copybrief performTupleMigrationKernel()
553 void MigrationBondedCUDAKernel::performTupleMigration(
557 constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
560 #define CALL(fieldName, typeName) do { \
561 numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
563 performTupleMigrationKernel<typeName><<<numBlocks, numThreads, 0, stream>>>( \
565 d_peer_offsets.fieldName, \
567 d_peer_data.fieldName, \
568 d_device.fieldName(), \
569 d_downstreamPatchIndex.fieldName(), \
570 d_dstIndex.fieldName() \
575 CALL(bond, CudaBondStage);
576 CALL(angle, CudaAngleStage);
577 CALL(dihedral, CudaDihedralStage);
578 CALL(improper, CudaDihedralStage);
579 CALL(modifiedExclusion, CudaExclusionStage);
580 CALL(exclusion, CudaExclusionStage);
581 CALL(crossterm, CudaCrosstermStage);
587 template <typename T_in, typename T_out>
588 __device__ __forceinline__
591 const PatchRecord* patches,
592 const Lattice lattice,
593 const double3* __restrict__ patchMapCenter,
594 const float4* __restrict__ xyzq);
598 __device__ __forceinline__
599 CudaBond convertTuple(
601 const PatchRecord* patches,
602 const Lattice lattice,
603 const double3* __restrict__ patchMapCenter,
604 const float4* __restrict__ xyzq
608 dst.itype = src.itype;
609 dst.scale = src.scale;
610 dst.fepBondedType = src.fepBondedType;
612 dst.i = src.index[0];
613 dst.j = src.index[1];
615 double3 center0 = patchMapCenter[src.patchIDs[0]];
616 double3 center1 = patchMapCenter[src.patchIDs[1]];
617 double3 diff = center0 - center1;
619 double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
620 double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
622 double3 shiftVec = lattice.wrap_delta_scaled(pos0, pos1);
624 shiftVec = shiftVec + diff;
626 dst.ioffsetXYZ = make_float3(shiftVec);
632 __device__ __forceinline__
633 CudaAngle convertTuple(
635 const PatchRecord* patches,
636 const Lattice lattice,
637 const double3* __restrict__ patchMapCenter,
638 const float4* __restrict__ xyzq) {
642 dst.itype = src.itype;
643 dst.scale = src.scale;
644 dst.fepBondedType = src.fepBondedType;
646 dst.i = src.index[0];
647 dst.j = src.index[1];
648 dst.k = src.index[2];
650 double3 center0 = patchMapCenter[src.patchIDs[0]];
651 double3 center1 = patchMapCenter[src.patchIDs[1]];
652 double3 center2 = patchMapCenter[src.patchIDs[2]];
654 double3 diff01 = center0 - center1;
655 double3 diff21 = center2 - center1;
657 double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
658 double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
659 double3 pos2 = make_double3(xyzq[dst.k]) + (double3) lattice.unscale(center2);
661 double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
662 double3 shiftVec21 = lattice.wrap_delta_scaled(pos2, pos1);
663 shiftVec01 = shiftVec01 + diff01;
664 shiftVec21 = shiftVec21 + diff21;
666 dst.ioffsetXYZ = make_float3(shiftVec01);
667 dst.koffsetXYZ = make_float3(shiftVec21);
673 __device__ __forceinline__
674 CudaDihedral convertTuple(
675 CudaDihedralStage& src,
676 const PatchRecord* patches,
677 const Lattice lattice,
678 const double3* __restrict__ patchMapCenter,
679 const float4* __restrict__ xyzq) {
683 dst.itype = src.itype;
684 dst.scale = src.scale;
685 dst.fepBondedType = src.fepBondedType;
687 dst.i = src.index[0];
688 dst.j = src.index[1];
689 dst.k = src.index[2];
690 dst.l = src.index[3];
692 double3 center0 = patchMapCenter[src.patchIDs[0]];
693 double3 center1 = patchMapCenter[src.patchIDs[1]];
694 double3 center2 = patchMapCenter[src.patchIDs[2]];
695 double3 center3 = patchMapCenter[src.patchIDs[3]];
697 double3 diff01 = center0 - center1;
698 double3 diff12 = center1 - center2;
699 double3 diff32 = center3 - center2;
701 double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
702 double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
703 double3 pos2 = make_double3(xyzq[dst.k]) + (double3) lattice.unscale(center2);
704 double3 pos3 = make_double3(xyzq[dst.l]) + (double3) lattice.unscale(center3);
706 double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
707 double3 shiftVec12 = lattice.wrap_delta_scaled(pos1, pos2);
708 double3 shiftVec32 = lattice.wrap_delta_scaled(pos3, pos2);
709 shiftVec01 = shiftVec01 + diff01;
710 shiftVec12 = shiftVec12 + diff12;
711 shiftVec32 = shiftVec32 + diff32;
713 dst.ioffsetXYZ = make_float3(shiftVec01);
714 dst.joffsetXYZ = make_float3(shiftVec12);
715 dst.loffsetXYZ = make_float3(shiftVec32);
721 __device__ __forceinline__
722 CudaExclusion convertTuple(
723 CudaExclusionStage& src,
724 const PatchRecord* patches,
725 const Lattice lattice,
726 const double3* __restrict__ patchMapCenter,
727 const float4* __restrict__ xyzq) {
731 dst.vdwtypei = src.vdwtypei;
732 dst.vdwtypej = src.vdwtypej;
733 dst.pswitch = src.pswitch;
735 dst.i = src.index[0];
736 dst.j = src.index[1];
738 double3 center0 = patchMapCenter[src.patchIDs[0]];
739 double3 center1 = patchMapCenter[src.patchIDs[1]];
741 double3 diff = center0 - center1;
743 double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
744 double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
746 double3 shiftVec = lattice.wrap_delta_scaled(pos0, pos1);
747 shiftVec = shiftVec + diff;
749 dst.ioffsetXYZ = make_float3(shiftVec);
755 __device__ __forceinline__
756 CudaCrossterm convertTuple(
757 CudaCrosstermStage& src,
758 const PatchRecord* patches,
759 const Lattice lattice,
760 const double3* __restrict__ patchMapCenter,
761 const float4* __restrict__ xyzq) {
765 dst.itype = src.itype;
766 dst.scale = src.scale;
767 dst.fepBondedType = src.fepBondedType;
769 dst.i1 = src.index[0];
770 dst.i2 = src.index[1];
771 dst.i3 = src.index[2];
772 dst.i4 = src.index[3];
773 dst.i5 = src.index[4];
774 dst.i6 = src.index[5];
775 dst.i7 = src.index[6];
776 dst.i8 = src.index[7];
778 double3 center0 = patchMapCenter[src.patchIDs[0]];
779 double3 center1 = patchMapCenter[src.patchIDs[1]];
780 double3 center2 = patchMapCenter[src.patchIDs[2]];
781 double3 center3 = patchMapCenter[src.patchIDs[3]];
782 double3 center4 = patchMapCenter[src.patchIDs[4]];
783 double3 center5 = patchMapCenter[src.patchIDs[5]];
784 double3 center6 = patchMapCenter[src.patchIDs[6]];
785 double3 center7 = patchMapCenter[src.patchIDs[7]];
787 double3 diff01 = center0 - center1;
788 double3 diff12 = center1 - center2;
789 double3 diff23 = center2 - center3;
790 double3 diff45 = center4 - center5;
791 double3 diff56 = center5 - center6;
792 double3 diff67 = center6 - center7;
794 double3 pos0 = make_double3(xyzq[dst.i1]) + (double3) lattice.unscale(center0);
795 double3 pos1 = make_double3(xyzq[dst.i2]) + (double3) lattice.unscale(center1);
796 double3 pos2 = make_double3(xyzq[dst.i3]) + (double3) lattice.unscale(center2);
797 double3 pos3 = make_double3(xyzq[dst.i4]) + (double3) lattice.unscale(center3);
798 double3 pos4 = make_double3(xyzq[dst.i5]) + (double3) lattice.unscale(center4);
799 double3 pos5 = make_double3(xyzq[dst.i6]) + (double3) lattice.unscale(center5);
800 double3 pos6 = make_double3(xyzq[dst.i7]) + (double3) lattice.unscale(center6);
801 double3 pos7 = make_double3(xyzq[dst.i8]) + (double3) lattice.unscale(center7);
803 double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
804 double3 shiftVec12 = lattice.wrap_delta_scaled(pos1, pos2);
805 double3 shiftVec23 = lattice.wrap_delta_scaled(pos2, pos3);
806 double3 shiftVec45 = lattice.wrap_delta_scaled(pos4, pos5);
807 double3 shiftVec56 = lattice.wrap_delta_scaled(pos5, pos6);
808 double3 shiftVec67 = lattice.wrap_delta_scaled(pos6, pos7);
810 shiftVec01 = shiftVec01 + diff01;
811 shiftVec12 = shiftVec12 + diff12;
812 shiftVec23 = shiftVec23 + diff23;
813 shiftVec45 = shiftVec45 + diff45;
814 shiftVec56 = shiftVec56 + diff56;
815 shiftVec67 = shiftVec67 + diff67;
817 dst.offset12XYZ = make_float3(shiftVec01);
818 dst.offset23XYZ = make_float3(shiftVec12);
819 dst.offset34XYZ = make_float3(shiftVec23);
820 dst.offset56XYZ = make_float3(shiftVec45);
821 dst.offset67XYZ = make_float3(shiftVec56);
822 dst.offset78XYZ = make_float3(shiftVec67);
830 __device__ __forceinline__
831 void updateStageData(
833 const PatchRecord* patches,
836 for (int i = 0; i < T::size; i++) {
837 // The patchID and ID are correct
838 const int patchID = tuple.patchIDs[i];
840 PatchRecord record = patches[patchID]; // This structure is indexed by global patch ID
841 // And it contains the LOCAL offset...
842 const int offset = record.atomStart;
843 tuple.index[i] += offset;
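// Worked example (hypothetical numbers, not part of the build): if a tuple's
// first atom is atom 12 of its patch (tuple.index[i] == 12) and that patch's
// atoms start at position 3072 of the device-wide packed arrays
// (patches[patchID].atomStart == 3072), the update above yields 12 + 3072 ==
// 3084, the device-wide index later consumed by convertTuple().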
848 * \brief Updates the tuple stage data and constructs the real tuple data
850 Similar to the other kernels, this one bulk reads tuple data to coalesce global memory accesses.
853 The stage update is done by the updateStageData() function, which computes the
854 new indexInDevice indices of each atom from the atom's index within its patch,
855 which is currently stored in the `index` field, and that patch's buffer offset
856 * in the device's packed buffers.
858 * The convertTuple function converts the stage tuple to a real tuple.
861 template <typename T_in, typename T_out>
862 __global__ void updateTuplesKernel(
868 const PatchRecord* patches,
869 const double3* __restrict__ patchMapCenter,
870 const float4* __restrict__ xyzq,
871 const Lattice lattice
873 using AccessType = int32_t;
875 __shared__ T_in s_in[MigrationBondedCUDAKernel::kNumThreads];
876 __shared__ T_out s_out[MigrationBondedCUDAKernel::kNumThreads];
878 AccessType* s_in_int = (AccessType*) &(s_in);
879 AccessType* s_out_int = (AccessType*) &(s_out);
881 const int numTuplesBlock = ((numTuples + blockDim.x - 1) / blockDim.x) * blockDim.x;
883 for (int i = blockDim.x * blockIdx.x; i < numTuplesBlock; i += blockDim.x * gridDim.x) {
884 // Load in tuples to shared memory
885 AccessType* g_in_int = (AccessType*) (stages_src + i);
886 const int numBytesIn = sizeof(T_in) * blockDim.x;
887 for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
888 const int tupleIdx = i + (j / (sizeof(T_in) / sizeof(AccessType)));
889 if (tupleIdx < numTuples) {
890 s_in_int[j] = g_in_int[j];
895 if (i + threadIdx.x < numTuples) {
896 T_in src = s_in[threadIdx.x];
897 updateStageData(src, patches, ids);
898 T_out r_out = convertTuple<T_in, T_out>(src, patches, lattice, patchMapCenter, xyzq);
899 s_out[threadIdx.x] = r_out;
900 s_in[threadIdx.x] = src;
905 AccessType* g_out_int = (AccessType*) (stages_dst + i);
906 for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
907 const int tupleIdx = i + (j / (sizeof(T_in) / sizeof(AccessType)));
908 if (tupleIdx < numTuples) {
909 g_out_int[j] = s_in_int[j];
913 AccessType* g_outData_int = (AccessType*) (dst + i);
914 const int numBytesOut = sizeof(T_out) * blockDim.x;
915 for (int j = threadIdx.x; j < numBytesOut / sizeof(AccessType); j += blockDim.x) {
916 const int tupleIdx = i + (j / (sizeof(T_out) / sizeof(AccessType)));
917 if (tupleIdx < numTuples) {
918 g_outData_int[j] = s_out_int[j];
926 * \brief Updates the tuples once they have been migrated
929 This will call the following kernels for each tuple type:
930 * - updateTuplesKernel() \copybrief updateTuplesKernel()
933 void MigrationBondedCUDAKernel::updateTuples(
937 const PatchRecord* patches,
938 const double3* d_patchMapCenter,
940 const Lattice lattice,
943 constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
946 #define CALL(fieldName, typeNameIn, typeNameOut) do { \
947 numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
949 updateTuplesKernel<typeNameIn, typeNameOut><<<numBlocks, numThreads, 0, stream>>>( \
963 CALL(bond, CudaBondStage, CudaBond);
964 CALL(angle, CudaAngleStage, CudaAngle);
965 CALL(dihedral, CudaDihedralStage, CudaDihedral);
966 CALL(improper, CudaDihedralStage, CudaDihedral);
967 CALL(modifiedExclusion, CudaExclusionStage, CudaExclusion);
968 CALL(exclusion, CudaExclusionStage, CudaExclusion);
969 CALL(crossterm, CudaCrosstermStage, CudaCrossterm);
975 void MigrationBondedCUDAKernel::copyTupleToDevice(
977 const int numPatchesHome,
978 TupleDataStage h_dataStage,
979 TupleIntArrays h_counts,
980 TupleIntArrays h_offsets,
983 #define CALL(fieldName, typeName) do { \
984 copy_HtoD<typeName>(h_dataStage.fieldName, dataDst.fieldName, count.fieldName, stream); \
985 copy_HtoD<int>(h_counts.fieldName, d_counts.fieldName(), numPatchesHome, stream); \
986 copy_HtoD<int>(h_offsets.fieldName, d_offsets.fieldName(), numPatchesHome+1, stream); \
989 CALL(bond, CudaBondStage);
990 CALL(angle, CudaAngleStage);
991 CALL(dihedral, CudaDihedralStage);
992 CALL(improper, CudaDihedralStage);
993 CALL(modifiedExclusion, CudaExclusionStage);
994 CALL(exclusion, CudaExclusionStage);
995 CALL(crossterm, CudaCrosstermStage);
1001 bool MigrationBondedCUDAKernel::reallocateBufferSrc(TupleCounts counts) {
1004 constexpr float kTupleOveralloc = ComputeBondedCUDAKernel::kTupleOveralloc;
1006 #define CALL(fieldName, typeName) do { \
1007 const size_t paddedCount = ComputeBondedCUDAKernel::warpAlign(counts.fieldName); \
1008 if (paddedCount) { \
1009 out |= reallocate_device<typeName>(&(dataSrc.fieldName), &(srcAlloc.fieldName), \
1010 paddedCount, kTupleOveralloc); \
1014 CALL(bond, CudaBondStage);
1015 CALL(angle, CudaAngleStage);
1016 CALL(dihedral, CudaDihedralStage);
1017 CALL(improper, CudaDihedralStage);
1018 CALL(modifiedExclusion, CudaExclusionStage);
1019 CALL(exclusion, CudaExclusionStage);
1020 CALL(crossterm, CudaCrosstermStage);
1024 const size_t totalSize = ComputeBondedCUDAKernel::warpAlign(counts.bond)
1025 + ComputeBondedCUDAKernel::warpAlign(counts.angle)
1026 + ComputeBondedCUDAKernel::warpAlign(counts.dihedral)
1027 + ComputeBondedCUDAKernel::warpAlign(counts.improper)
1028 + ComputeBondedCUDAKernel::warpAlign(counts.modifiedExclusion)
1029 + ComputeBondedCUDAKernel::warpAlign(counts.exclusion)
1030 + ComputeBondedCUDAKernel::warpAlign(counts.crossterm);
1032 size_t temp = srcTotalAlloc;
1033 reallocate_device<int>(&(d_device.data), &temp, totalSize, kTupleOveralloc);
1034 temp = srcTotalAlloc;
1035 reallocate_device<int>(&(d_downstreamPatchIndex.data), &temp, totalSize, kTupleOveralloc);
1036 temp = srcTotalAlloc;
1037 reallocate_device<int>(&(d_dstIndex.data), &temp, totalSize, kTupleOveralloc);
1038 srcTotalAlloc = temp;
1042 #define CALL(fieldName, typeName, typeIndex) do { \
1043 d_device.offsets[typeIndex] = offset; \
1044 d_dstIndex.offsets[typeIndex] = offset; \
1045 d_downstreamPatchIndex.offsets[typeIndex] = offset; \
1046 offset += (ComputeBondedCUDAKernel::warpAlign(counts.fieldName)); \
1049 CALL(bond, CudaBondStage, 0);
1050 CALL(angle, CudaAngleStage, 1);
1051 CALL(dihedral, CudaDihedralStage, 2);
1052 CALL(improper, CudaDihedralStage, 3);
1053 CALL(modifiedExclusion, CudaExclusionStage, 4);
1054 CALL(exclusion, CudaExclusionStage, 5);
1055 CALL(crossterm, CudaCrosstermStage, 6);
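// The offsets written above partition one contiguous int allocation among the
// seven tuple types with warp-aligned starting positions, so the per-type
// accessors used elsewhere (d_device.bond(), d_dstIndex.angle(), ...) can
// presumably resolve to `data + offsets[typeIndex]` without a separate
// allocation per tuple type (an assumption about TupleIntArraysContiguous,
// noted here only for illustration).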
1063 bool MigrationBondedCUDAKernel::reallocateBufferDst(TupleCounts counts) {
1064 constexpr float kTupleOveralloc = ComputeBondedCUDAKernel::kTupleOveralloc;
1067 #define CALL(fieldName, typeName) do { \
1068 const size_t paddedCount = ComputeBondedCUDAKernel::warpAlign(counts.fieldName); \
1069 if (paddedCount) { \
1070 out |= reallocate_device<typeName>(&(dataDst.fieldName), &(dstAlloc.fieldName), \
1071 paddedCount, kTupleOveralloc); \
1075 CALL(bond, CudaBondStage);
1076 CALL(angle, CudaAngleStage);
1077 CALL(dihedral, CudaDihedralStage);
1078 CALL(improper, CudaDihedralStage);
1079 CALL(modifiedExclusion, CudaExclusionStage);
1080 CALL(exclusion, CudaExclusionStage);
1081 CALL(crossterm, CudaCrosstermStage);
1089 MigrationBondedCUDAKernel::MigrationBondedCUDAKernel() {
1090 #define CALL(fieldName) do { \
1091 srcAlloc.fieldName = 0; \
1092 dstAlloc.fieldName = 0; \
1093 dataSrc.fieldName = NULL; \
1094 dataDst.fieldName = NULL; \
1095 d_peer_data.fieldName = NULL; \
1096 d_peer_counts.fieldName = NULL; \
1097 d_peer_offsets.fieldName = NULL; \
1104 CALL(modifiedExclusion);
1111 d_device.data = NULL;
1112 d_downstreamPatchIndex.data = NULL;
1113 d_dstIndex.data = NULL;
1115 d_counts.data = NULL;
1116 d_offsets.data = NULL;
1118 patchDeviceScan_alloc = 0;
1119 d_patchDeviceScan_scratch = NULL;
1121 allocate_device<TupleCounts>(&d_totalCount, 1);
1122 allocate_host<TupleCounts>(&h_totalCount, 1);
1125 MigrationBondedCUDAKernel::~MigrationBondedCUDAKernel() {
1126 #define CALL(fieldName, typeName) do { \
1127 if (dataSrc.fieldName != NULL) deallocate_device<typeName>(&(dataSrc.fieldName)); \
1128 if (dataDst.fieldName != NULL) deallocate_device<typeName>(&(dataDst.fieldName)); \
1129 if (d_peer_data.fieldName != NULL) deallocate_device<typeName*>(&(d_peer_data.fieldName)); \
1130 if (d_peer_counts.fieldName != NULL) deallocate_device<int*>(&(d_peer_counts.fieldName)); \
1131 if (d_peer_offsets.fieldName != NULL) deallocate_device<int*>(&(d_peer_offsets.fieldName)); \
1134 CALL(bond, CudaBondStage);
1135 CALL(angle, CudaAngleStage);
1136 CALL(dihedral, CudaDihedralStage);
1137 CALL(improper, CudaDihedralStage);
1138 CALL(modifiedExclusion, CudaExclusionStage);
1139 CALL(exclusion, CudaExclusionStage);
1140 CALL(crossterm, CudaCrosstermStage);
1144 if (d_counts.data != NULL) deallocate_device<int>(&d_counts.data);
1145 if (d_offsets.data != NULL) deallocate_device<int>(&d_offsets.data);
1146 if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
1148 if (d_device.data != NULL) deallocate_device<int>(&d_device.data);
1149 if (d_downstreamPatchIndex.data != NULL) deallocate_device<int>(&d_downstreamPatchIndex.data);
1150 if (d_dstIndex.data != NULL) deallocate_device<int>(&d_dstIndex.data);
1152 deallocate_device<TupleCounts>(&d_totalCount);
1153 deallocate_host<TupleCounts>(&h_totalCount);
1156 void MigrationBondedCUDAKernel::setup(const int numDevices, const int numPatchesHome) {
1158 // Allocate data behind the patch counters. Allocating numPatchesHome + 1 for
1161 numPatchesHomePad = ((numPatchesHome + 1 + kPatchNumberPad - 1)
1162 / kPatchNumberPad) * kPatchNumberPad;
1163 #if defined(NAMD_CUDA)
1164 allocate_device<int>(&(d_counts.data), numPatchesHomePad * kNumTuplesTypes);
1165 allocate_device<int>(&(d_offsets.data), numPatchesHomePad * kNumTuplesTypes);
1167 hipExtMallocWithFlags((void**) &(d_counts.data), sizeof(int)*numPatchesHomePad * kNumTuplesTypes, hipDeviceMallocFinegrained);
1168 hipExtMallocWithFlags((void**) &(d_offsets.data), sizeof(int)*numPatchesHomePad * kNumTuplesTypes, hipDeviceMallocFinegrained);
1170 cudaMemset(d_offsets.data, 0, numPatchesHomePad * kNumTuplesTypes * sizeof(int));
1172 #if defined(NAMD_CUDA)
1173 #define CALL(fieldName, typeName, offset) do { \
1174 allocate_device<typeName*>(&(d_peer_data.fieldName), numDevices); \
1175 allocate_device<int*>(&(d_peer_counts.fieldName), numDevices); \
1176 allocate_device<int*>(&(d_peer_offsets.fieldName), numDevices); \
1177 d_counts.offsets[offset] = offset * numPatchesHomePad; \
1178 d_offsets.offsets[offset] = offset * numPatchesHomePad; \
1182 #define CALL(fieldName, typeName, offset) do { \
1183 allocate_device<typeName*>(&(d_peer_data.fieldName), numDevices); \
1184 hipExtMallocWithFlags((void**) &(d_peer_counts.fieldName), sizeof(int*)*numDevices, hipDeviceMallocFinegrained); \
1185 allocate_device<int*>(&(d_peer_offsets.fieldName), numDevices); \
1186 d_counts.offsets[offset] = offset * numPatchesHomePad; \
1187 d_offsets.offsets[offset] = offset * numPatchesHomePad; \
1192 CALL(bond, CudaBondStage, 0);
1193 CALL(angle, CudaAngleStage, 1);
1194 CALL(dihedral, CudaDihedralStage, 2);
1195 CALL(improper, CudaDihedralStage, 3);
1196 CALL(modifiedExclusion, CudaExclusionStage, 4);
1197 CALL(exclusion, CudaExclusionStage, 5);
1198 CALL(crossterm, CudaCrosstermStage, 6);
1204 // Allocate space for CUB Device functions
1206 size_t temp_storage_bytes = 0;
1207 int* temp_ptr = NULL;
1208 cub::DeviceScan::InclusiveSum(
1209 NULL, temp_storage_bytes,
1210 temp_ptr, temp_ptr, numPatchesHome
1212 patchDeviceScan_alloc = temp_storage_bytes;
1213 allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
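// For reference, the scratch buffer sized above is consumed by the standard
// two-phase CUB pattern used in computePatchOffsets(): a first call with a NULL
// temp-storage pointer only reports the required byte count, and a second call
// performs the scan. A stand-alone sketch (placeholder names; not part of the
// build):
#if 0
void exampleDeviceScan(const int* d_in, int* d_out, const int n,
                       void* d_temp, size_t tempBytes, cudaStream_t stream) {
  size_t required = 0;
  cub::DeviceScan::InclusiveSum(NULL, required, d_in, d_out, n, stream);    // size query only
  // required <= tempBytes is expected to hold if d_temp was sized by such a query
  cub::DeviceScan::InclusiveSum(d_temp, tempBytes, d_in, d_out, n, stream); // actual scan
}
#endif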
1216 void MigrationBondedCUDAKernel::clearTupleCounts(const int numPatchesHome, cudaStream_t stream) {
1217 cudaMemsetAsync(d_counts.data, 0, numPatchesHomePad * kNumTuplesTypes * sizeof(int), stream);
1220 TupleCounts MigrationBondedCUDAKernel::fetchTupleCounts(const int numPatchesHome, cudaStream_t stream) {
1222 copy_DtoH<TupleCounts>(d_totalCount, h_totalCount, 1, stream);
1224 cudaCheck(cudaStreamSynchronize(stream));
1225 TupleCounts newLocal;
1227 #define CALL(fieldName, typeName) do { \
1228 newLocal.fieldName = h_totalCount[0].fieldName; \
1231 CALL(bond, CudaBondStage);
1232 CALL(angle, CudaAngleStage);
1233 CALL(dihedral, CudaDihedralStage);
1234 CALL(improper, CudaDihedralStage);
1235 CALL(modifiedExclusion, CudaExclusionStage);
1236 CALL(exclusion, CudaExclusionStage);
1237 CALL(crossterm, CudaCrosstermStage);
1244 void MigrationBondedCUDAKernel::copyPeerDataToDevice(
1245 TupleDataStagePeer h_peer_data,
1246 TupleIntArraysPeer h_peer_counts,
1247 TupleIntArraysPeer h_peer_offsets,
1248 const int numDevices,
1251 #define CALL(fieldName, typeName) do { \
1252 copy_HtoD<typeName*>(h_peer_data.fieldName, d_peer_data.fieldName, numDevices, stream); \
1253 copy_HtoD<int*>(h_peer_counts.fieldName, d_peer_counts.fieldName, numDevices, stream); \
1254 copy_HtoD<int*>(h_peer_offsets.fieldName, d_peer_offsets.fieldName, numDevices, stream); \
1257 CALL(bond, CudaBondStage);
1258 CALL(angle, CudaAngleStage);
1259 CALL(dihedral, CudaDihedralStage);
1260 CALL(improper, CudaDihedralStage);
1261 CALL(modifiedExclusion, CudaExclusionStage);
1262 CALL(exclusion, CudaExclusionStage);
1263 CALL(crossterm, CudaCrosstermStage);