NAMD
MigrationBondedCUDAKernel.cu
1 /**
2  * \file
3  * Tuple migration kernels and wrapper functions
4  */
5 
6 #ifdef NAMD_CUDA
7 #if __CUDACC_VER_MAJOR__ >= 11
8 #include <cub/cub.cuh>
9 #else
10 #include <namd_cub/cub.cuh>
11 #endif
12 #else // NAMD_HIP
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
15 #define cub hipcub
16 #endif // end NAMD_CUDA vs. NAMD_HIP
17 
18 #include "HipDefines.h"
19 
20 #include "MigrationBondedCUDAKernel.h"
21 #include "ComputeBondedCUDAKernel.h"
22 
23 #ifdef NODEGROUP_FORCE_REGISTER
24 
25 // Since only the downstream function from the patch map is needed,
26 // it is implemented separately here as a device function.
27 __device__ __forceinline__ int patchMap_downstream(
28  const int pid1, const int pid2, const int aDim, const int bDim,
29  const int cMaxIndex, const int bMaxIndex, const int aMaxIndex
30 ) {
31  int ds;
32  if (pid1 == pid2) {
33  ds = pid1;
34  } else if (pid1 == -1 || pid2 == -1) { // -1 means notUsed
35  ds = -1;
36  } else {
37  int3 data1, data2;
38  data1.x = pid1 % aDim;
39  data1.y = (pid1 / aDim) % bDim;
40  data1.z = pid1 / (aDim * bDim);
41 
42  data2.x = pid2 % aDim;
43  data2.y = (pid2 / aDim) % bDim;
44  data2.z = pid2 / (aDim * bDim);
45 
46  int k = data1.z;
47  int k2 = data2.z;
48  if ( ( k ? k : cMaxIndex ) == k2 + 1 ) k = k2;
49 
50  int j = data1.y;
51  int j2 = data2.y;
52  if ( ( j ? j : bMaxIndex ) == j2 + 1 ) j = j2;
53 
54  int i = data1.x;
55  int i2 = data2.x;
56  if ( ( i ? i : aMaxIndex ) == i2 + 1 ) i = i2;
57 
58  ds = ((k*bDim)+j)*aDim + i;
59  }
60  return ds;
61 }
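// Worked example (illustrative values, not taken from a real simulation): the
// patch ID used above is a row-major flattening of the (i, j, k) patch-grid
// coordinates, pid = ((k * bDim) + j) * aDim + i.  With aDim = 4 and bDim = 4,
// pid = 22 decodes to i = 22 % 4 = 2, j = (22 / 4) % 4 = 1, k = 22 / 16 = 1,
// and re-encodes as ((1 * 4) + 1) * 4 + 2 = 22, matching the decomposition and
// the final ds expression in patchMap_downstream().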
62 
63 
64 /**
65  * \brief Computes the destination information for each tuple
66  *
67  * This kernel computes the destination device and patch for each tuple. It also
68  * converts the indexInDevice value stored in the "index" field to the local index
69  * of the atom in each patch via the migration destination values. It computes the
70  * downstream patch index on that patch's home device.
71  *
72  * The destination device and downstream patch index are stored in separate buffers
73  * because they don't need to be migrated to new devices.
74  *
75  * Since the tuple data is stored as AoS, we bulk-load a batch into shared memory
76  * to use coalesced accesses.
77  *
78  */
79 template<typename T>
80 __global__ void computeTupleDestinationKernel(
81  const int numTuples,
82  T* stages,
83  int* d_device,
84  int* d_downstreamPatchIndex,
85  const int4* migrationDestination,
86  const int* patchToGPU,
87  const int* patchIDtoHomePatchIndex,
88  const int aDim,
89  const int bDim,
90  const int cMaxIndex,
91  const int bMaxIndex,
92  const int aMaxIndex
93 ) {
94  using AccessType = int32_t;
95  __shared__ T s_in[MigrationBondedCUDAKernel::kNumThreads];
96  AccessType* s_in_int = (AccessType*) &(s_in);
97 
98  const int numTuplesBlock = ((numTuples + blockDim.x - 1) / blockDim.x) * blockDim.x;
99 
100  for (int i = blockDim.x * blockIdx.x; i < numTuplesBlock; i += blockDim.x * gridDim.x) {
101  AccessType* g_in_int = (AccessType*) (stages + i);
102  const int numBytesIn = sizeof(T) * blockDim.x;
103  for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
104  const int tupleIdx = i + (j / (sizeof(T) / sizeof(AccessType)));
105  if (tupleIdx < numTuples) {
106  s_in_int[j] = g_in_int[j];
107  }
108  }
109  __syncthreads();
110 
111  const int tupleIndex = i + threadIdx.x;
112  if (tupleIndex < numTuples) {
113  T elem = s_in[threadIdx.x];
114  const int tmp_pid = migrationDestination[elem.index[0]].x;
115  int downstream_patch = tmp_pid;
116  elem.patchIDs[0] = tmp_pid;
117  elem.index[0] = migrationDestination[elem.index[0]].w;
118 
119  for (int j = 1; j < T::size; j++) {
120  const int tmp_pid = migrationDestination[elem.index[j]].x;
121 
122  downstream_patch = patchMap_downstream(
123  downstream_patch, tmp_pid,
124  aDim, bDim, cMaxIndex, bMaxIndex, aMaxIndex
125  );
126  elem.patchIDs[j] = tmp_pid;
127  elem.index[j] = migrationDestination[elem.index[j]].w;
128  }
129 
130  s_in[threadIdx.x] = elem;
131  d_device[tupleIndex] = patchToGPU[downstream_patch];
132  d_downstreamPatchIndex[tupleIndex] = patchIDtoHomePatchIndex[downstream_patch];
133  }
134  __syncthreads();
135 
136  // Write tuples out
137  for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
138  const int tupleIdx = i + (j / (sizeof(T) / sizeof(AccessType)));
139  if (tupleIdx < numTuples) {
140  g_in_int[j] = s_in_int[j];
141  }
142  }
143  __syncthreads();
144  }
145 }
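// The following is a minimal, self-contained sketch of the AoS staging pattern
// used by computeTupleDestinationKernel() above: the tuple array is read and
// written as 4-byte words so that consecutive threads touch consecutive words,
// while the per-tuple work happens on whole structs in shared memory.  The
// Tuple type, block size, and kernel name are illustrative placeholders, not
// part of NAMD; launch with BLOCK threads per block and one tile per block.
namespace example_aos_staging {

struct Tuple { int index[2]; int patchIDs[2]; };  // hypothetical 16-byte tuple

template <int BLOCK>
__global__ void touchTuples(const int numTuples, Tuple* tuples) {
  using Word = int32_t;
  constexpr int kWordsPerTuple = sizeof(Tuple) / sizeof(Word);
  __shared__ Tuple s_tuples[BLOCK];
  Word* s_words = reinterpret_cast<Word*>(s_tuples);

  const int base = BLOCK * blockIdx.x;
  Word* g_words = reinterpret_cast<Word*>(tuples + base);

  // Coalesced load: consecutive threads read consecutive 4-byte words,
  // guarded per tuple so no out-of-range tuple is touched.
  for (int j = threadIdx.x; j < BLOCK * kWordsPerTuple; j += blockDim.x) {
    if (base + j / kWordsPerTuple < numTuples) s_words[j] = g_words[j];
  }
  __syncthreads();

  // Each thread now owns one whole tuple in shared memory.
  if (base + threadIdx.x < numTuples) {
    Tuple t = s_tuples[threadIdx.x];
    t.index[0] += 0;  // placeholder for per-tuple work
    s_tuples[threadIdx.x] = t;
  }
  __syncthreads();

  // Coalesced store back to global memory.
  for (int j = threadIdx.x; j < BLOCK * kWordsPerTuple; j += blockDim.x) {
    if (base + j / kWordsPerTuple < numTuples) g_words[j] = s_words[j];
  }
}

} // namespace example_aos_staging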
146 
147 /**
148  * \brief Computes the new index of non-migrating tuples
149  *
150  * Most tuples do not move into new patches, so we will compute their new index first
151  * using a scan operation. There can be a large number of tuples per patch; more than
152  * the number of atoms per patch. So we do the scan operation in batches while still
153  * using the CUB block API.
154  *
155  */
156 __global__ void reserveLocalDestinationKernel(
157  const int myDeviceIndex,
158  const int numPatchesHome,
159  const int* d_device,
160  const int* d_downstreamPatchIndex,
161  const int* d_offsets,
162  int* d_dstIndex,
163  int* d_counts
164 ) {
165  constexpr int kNumThreads = MigrationBondedCUDAKernel::kScanThreads;
166  constexpr int kItemsPerThread = MigrationBondedCUDAKernel::kScanItemsPerThread;
167  constexpr int kItemsPerIteration = kNumThreads * kItemsPerThread;
168 
169  typedef cub::BlockScan<int, kNumThreads> BlockScan;
170  __shared__ typename BlockScan::TempStorage temp_storage;
171 
172  for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
173  const int offset = d_offsets[patchIndex];
174  const int numTuples = d_offsets[patchIndex+1] - offset;
175 
176  const int numIterations = (numTuples + kItemsPerIteration - 1) / kItemsPerIteration;
177 
178  // We will do the scan in pieces in case there are too many tuples
179  int input_prefix = 0;
180  for (int iter = 0; iter < numIterations; iter++) {
181  int thread_input[kItemsPerThread];
182  int thread_output[kItemsPerThread];
183  int sum;
184 
185  // Load the batch into registers
186  for (int i = 0; i < kItemsPerThread; i++) {
187  const int idx = iter * kItemsPerIteration + kItemsPerThread * threadIdx.x + i;
188  if (idx < numTuples && d_downstreamPatchIndex[offset + idx] == patchIndex
189  && d_device[offset + idx] == myDeviceIndex) {
190  thread_input[i] = 1;
191  } else {
192  thread_input[i] = 0;
193  }
194  }
195 
196  // Compute the prefix sum
197  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, sum);
198  __syncthreads();
199 
200  // Write the output
201  for (int i = 0; i < kItemsPerThread; i++) {
202  const int idx = iter * kItemsPerIteration + kItemsPerThread * threadIdx.x + i;
203  if (idx < numTuples && thread_input[i]) {
204  d_dstIndex[offset + idx] = thread_output[i] + input_prefix;
205  }
206  }
207  input_prefix += sum;
208  __syncthreads();
209  }
210  // We then need to set the count for this patch
211  if (threadIdx.x == 0) {
212  d_counts[patchIndex] = input_prefix;
213  }
214  __syncthreads();
215  }
216 }
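// Minimal sketch of the batched block-scan idea used above: when a segment has
// more items than one scan tile (threads x items-per-thread), the exclusive
// scan is run tile by tile and the running total is carried forward in a
// register.  Block size, items per thread, and names are illustrative; one
// block is assumed to own one independent segment, as in the kernel above.
namespace example_batched_scan {

constexpr int kThreads = 128;
constexpr int kItems = 4;

__global__ void flagPositions(const int n, const int* flags, int* positions) {
  typedef cub::BlockScan<int, kThreads> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;

  const int tile = kThreads * kItems;
  int prefix = 0;  // running total carried across tiles
  for (int start = 0; start < n; start += tile) {
    int in[kItems], out[kItems], tileSum;
    for (int k = 0; k < kItems; k++) {
      const int idx = start + kItems * threadIdx.x + k;
      in[k] = (idx < n) ? flags[idx] : 0;
    }
    BlockScan(temp_storage).ExclusiveSum(in, out, tileSum);
    __syncthreads();  // temp_storage is reused by the next tile
    for (int k = 0; k < kItems; k++) {
      const int idx = start + kItems * threadIdx.x + k;
      if (idx < n && in[k]) positions[idx] = prefix + out[k];
    }
    prefix += tileSum;
    __syncthreads();
  }
}

} // namespace example_batched_scan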
217 
218 /**
219  * \brief Computes the destination for all tuples and the new index for local tuples
220  *
221  * This will call the following kernels for each tuple type:
222  * - computeTupleDestinationKernel() \copybrief computeTupleDestinationKernel()
223  * - reserveLocalDestinationKernel() \copybrief reserveLocalDestinationKernel()
224  *
225  */
226 void MigrationBondedCUDAKernel::computeTupleDestination(
227  const int myDeviceIndex,
228  TupleCounts count,
229  const int numPatchesHome,
230  const int4* migrationDestination,
231  const int* patchIDtoGPU,
232  const int* patchIDtoHomePatchIndex,
233  const int aDim,
234  const int bDim,
235  const int cMaxIndex,
236  const int bMaxIndex,
237  const int aMaxIndex,
238  cudaStream_t stream
239 ) {
240  constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
241  int numBlocks;
242 
243  #define CALL(fieldName, typeName) do { \
244  numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
245  if (numBlocks) { \
246  computeTupleDestinationKernel<typeName><<<numBlocks, numThreads, 0, stream>>>( \
247  count.fieldName, \
248  dataSrc.fieldName, \
249  d_device.fieldName(), \
250  d_downstreamPatchIndex.fieldName(), \
251  migrationDestination, \
252  patchIDtoGPU, \
253  patchIDtoHomePatchIndex, \
254  aDim, bDim, \
255  cMaxIndex, bMaxIndex, aMaxIndex \
256  ); \
257  } \
258  } while (0);
259 
260  CALL(bond, CudaBondStage);
261  CALL(angle, CudaAngleStage);
262  CALL(dihedral, CudaDihedralStage);
263  CALL(improper, CudaDihedralStage);
264  CALL(modifiedExclusion, CudaExclusionStage);
265  CALL(exclusion, CudaExclusionStage);
266  CALL(crossterm, CudaCrosstermStage);
267 
268  #undef CALL
269 
270  #define CALL(fieldName, typeName) do { \
271  numBlocks = numPatchesHome; \
272  reserveLocalDestinationKernel<<<numBlocks, kScanThreads, 0, stream>>>( \
273  myDeviceIndex, \
274  numPatchesHome, \
275  d_device.fieldName(), \
276  d_downstreamPatchIndex.fieldName(), \
277  d_offsets.fieldName(), \
278  d_dstIndex.fieldName(), \
279  d_counts.fieldName() \
280  ); \
281  } while (0);
282 
283  CALL(bond, CudaBondStage);
284  CALL(angle, CudaAngleStage);
285  CALL(dihedral, CudaDihedralStage);
286  CALL(improper, CudaDihedralStage);
287  CALL(modifiedExclusion, CudaExclusionStage);
288  CALL(exclusion, CudaExclusionStage);
289  CALL(crossterm, CudaCrosstermStage);
290 
291  #undef CALL
292 }
293 
294 /**
295  * \brief Computes the new index of migrating tuples
296  *
297  * Uses atomic operations to compute the new index for tuples which have migrated
298  * into new patches.
299  *
300  */
301 __global__ void reserveTupleDestinationKernel(
302  const int myDeviceIndex,
303  const int numPatchesHome,
304  const int* d_device,
305  const int* d_downstreamPatchIndex,
306  const int* d_offsets,
307  int* d_dstIndex,
308  int** peer_counts
309 ) {
310  for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
311  const int offset = d_offsets[patchIndex];
312  const int numTuples = d_offsets[patchIndex+1] - offset;
313 
314  for (int i = threadIdx.x; i < numTuples; i += blockDim.x) {
315  const int downstreamPatchIndex = d_downstreamPatchIndex[offset + i];
316  const int device = d_device[offset + i];
317  if (downstreamPatchIndex != patchIndex || device != myDeviceIndex) {
318  const int dstIndex =
319 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
320  atomicAdd_system(&(peer_counts[device][downstreamPatchIndex]), 1);
321 #else
322  // support single-GPU Maxwell
323  atomicAdd(&(peer_counts[device][downstreamPatchIndex]), 1);
324 #endif
325  d_dstIndex[offset + i] = dstIndex;
326  }
327  }
328  __syncthreads();
329  }
330 }
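// Minimal sketch of the reservation idea used above: a migrating element grabs
// a slot by atomically incrementing a per-destination counter that may live in
// another device's memory, which requires a system-scope atomic on multi-GPU
// builds.  The names (destBucket, counters, slot) are illustrative placeholders.
__global__ void example_reserveSlots(
  const int n, const int* destBucket, int* const* counters, int* slot
) {
  const int i = threadIdx.x + blockDim.x * blockIdx.x;
  if (i < n) {
    const int b = destBucket[i];
#if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
    slot[i] = atomicAdd_system(counters[b], 1);  // visible across devices
#else
    slot[i] = atomicAdd(counters[b], 1);         // single-GPU fallback
#endif
  }
}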
331 
332 /**
333  * \brief Computes the index of migrating tuples with atomics
334  *
335  * This will call the following kernels for each tuple type:
336  * - reserveTupleDestinationKernel() \copybrief reserveTupleDestinationKernel()
337  *
338  */
339 void MigrationBondedCUDAKernel::reserveTupleDestination(
340  const int myDeviceIndex,
341  const int numPatchesHome,
342  cudaStream_t stream
343 ) {
344  constexpr int numThreads = MigrationBondedCUDAKernel::kPatchThreads;
345  int numBlocks;
346 
347  #define CALL(fieldName, typeName) do { \
348  numBlocks = numPatchesHome; \
349  reserveTupleDestinationKernel<<<numBlocks, numThreads, 0, stream>>>( \
350  myDeviceIndex, \
351  numPatchesHome, \
352  d_device.fieldName(), \
353  d_downstreamPatchIndex.fieldName(), \
354  d_offsets.fieldName(), \
355  d_dstIndex.fieldName(), \
356  d_peer_counts.fieldName \
357  ); \
358  } while (0);
359 
360  CALL(bond, CudaBondStage);
361  CALL(angle, CudaAngleStage);
362  CALL(dihedral, CudaDihedralStage);
363  CALL(improper, CudaDihedralStage);
364  CALL(modifiedExclusion, CudaExclusionStage);
365  CALL(exclusion, CudaExclusionStage);
366  CALL(crossterm, CudaCrosstermStage);
367 
368  #undef CALL
369 }
370 
371 /**
372  * \brief Gathers the per-device tuple counts so they can be transferred together
373  *
374  */
375 __global__ void updateTotalCount(
376  const int numPatchesHome,
377  TupleIntArraysContiguous d_offsets,
378  TupleCounts* d_totalCounts
379 ) {
380  d_totalCounts[0].bond = d_offsets.bond()[numPatchesHome];
381  d_totalCounts[0].angle = d_offsets.angle()[numPatchesHome];
382  d_totalCounts[0].dihedral = d_offsets.dihedral()[numPatchesHome];
383  d_totalCounts[0].improper = d_offsets.improper()[numPatchesHome];
384  d_totalCounts[0].modifiedExclusion = d_offsets.modifiedExclusion()[numPatchesHome];
385  d_totalCounts[0].exclusion = d_offsets.exclusion()[numPatchesHome];
386 }
387 
388 /**
389  * \brief Computes the patch-level offsets of the tuples
390  *
391  * This uses the block API because for most systems the number of patches does not
392  * motivate the use of the device API.
393  *
394  */
395 __global__ void threadBlockPatchScan(
396  const int numPatchesHome,
397  TupleIntArraysContiguous d_counts,
398  TupleIntArraysContiguous d_offsets,
399  TupleCounts* d_totalCounts
400 ) {
401  constexpr int kItemsPerThread = MigrationBondedCUDAKernel::kPatchItemsPerThread;
402 
403  typedef cub::BlockScan<int, MigrationBondedCUDAKernel::kPatchThreads> BlockScan;
404  __shared__ typename BlockScan::TempStorage temp_storage;
405 
406  int* input = d_counts.data + d_counts.offsets[blockIdx.x];
407  int* output = d_offsets.data + d_offsets.offsets[blockIdx.x];
408 
409  int r_input[kItemsPerThread];
410  int r_output[kItemsPerThread];
411 
412  for (int i = 0; i < kItemsPerThread; i++) {
413  const int idx = kItemsPerThread * threadIdx.x + i;
414  if (idx < numPatchesHome) {
415  r_input[i] = input[idx];
416  } else {
417  r_input[i] = 0;
418  }
419  }
420  __syncthreads();
421 
422  // Compute the prefix sum of the per-patch tuple counts
423  int total;
424  BlockScan(temp_storage).InclusiveSum(r_input, r_output, total);
425  __syncthreads();
426 
427  for (int i = 0; i < kItemsPerThread; i++) {
428  const int idx = kItemsPerThread * threadIdx.x + i + 1;
429  if (idx < numPatchesHome+1) {
430  output[idx] = r_output[i];
431  }
432  }
433 
434  if (threadIdx.x == 0) {
435  int* int_totalCounts = (int*) d_totalCounts;
436  int_totalCounts[blockIdx.x] = total;
437  }
438 }
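// Worked example of the counts -> offsets conversion above (illustrative
// numbers): per-patch tuple counts {3, 0, 5, 2} have inclusive sums
// {3, 3, 8, 10}, which land in offsets[1..4]; offsets[0] keeps the 0 it was
// initialized to in setup(), yielding the patch-level offset array
// {0, 3, 3, 8, 10} and a total of 10 tuples for that type.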
439 
440 /**
441  * \brief Computes the patch-level offsets of the tuples
442  *
443  * Depending on the number of home patches, we will either use the CUB device API
444  * or the CUB block API. The block API has less overhead since it is done in a single
445  * kernel, whereas the device API launches 2 kernels per tuple type.
446  *
447  * If the single kernel path is taken, it will call the following kernel:
448  * - threadBlockPatchScan() \copybrief threadBlockPatchScan()
449  *
450  */
451 void MigrationBondedCUDAKernel::computePatchOffsets(
452  const int numPatchesHome,
453  cudaStream_t stream
454 ) {
455  if (numPatchesHome > kMaxPatchesForSingleThreadBlock) {
456  #define CALL(fieldName) do { \
457  cub::DeviceScan::InclusiveSum( \
458  d_patchDeviceScan_scratch, temp, \
459  d_counts.fieldName(), d_offsets.fieldName()+1, numPatchesHome, stream \
460  ); \
461  } while (0);
462 
463  size_t temp = patchDeviceScan_alloc;
464  CALL(bond);
465  CALL(angle);
466  CALL(dihedral);
467  CALL(improper);
468  CALL(modifiedExclusion);
469  CALL(exclusion);
470  CALL(crossterm);
471 
472  #undef CALL
473 
474  updateTotalCount<<<1, 1, 0, stream>>>(numPatchesHome, d_offsets, d_totalCount);
475  } else {
476  constexpr int numThreads = kPatchThreads;
477  const int numBlocks = kNumTuplesTypes;
478 
479  threadBlockPatchScan<<<numBlocks, numThreads, 0, stream>>>(
480  numPatchesHome,
481  d_counts,
482  d_offsets,
483  d_totalCount
484  );
485  }
486 }
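// Minimal sketch of the two-phase CUB device-API pattern taken on the branch
// above: the first call with a NULL buffer only reports the required scratch
// size, the second call performs the scan.  In this class the scratch is sized
// once in setup(); the sketch below allocates it inline for clarity, and the
// names (exampleCounts, exampleOffsets) are placeholders.
inline void example_deviceInclusiveSum(
  const int* exampleCounts, int* exampleOffsets, const int numItems, cudaStream_t stream
) {
  void* d_temp = NULL;
  size_t tempBytes = 0;
  // Phase 1: size query only; no work is done.
  cub::DeviceScan::InclusiveSum(d_temp, tempBytes, exampleCounts,
                                exampleOffsets + 1, numItems, stream);
  cudaMalloc(&d_temp, tempBytes);
  // Phase 2: the actual scan; exampleOffsets[0] is assumed to be preset to 0.
  cub::DeviceScan::InclusiveSum(d_temp, tempBytes, exampleCounts,
                                exampleOffsets + 1, numItems, stream);
  cudaFree(d_temp);
}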
487 
488 /**
489  * \brief Performs actual tuple migration
490  *
491  * Based on previously computed tuple devices, patch index, patch offsets, and
492  * Based on the previously computed tuple devices, patch indices, patch offsets, and
493  * tuple indices, this performs the actual tuple migration. The tuples are stored as AoS,
494  * so we keep memory accesses as coalesced as possible. This does leave some
495  * threads unused.
496  * This copies the tuples into a scratch buffer
497  *
498  */
499 template<typename T>
500 __global__ void performTupleMigrationKernel(
501  const int numTuples,
502  const int* const* peer_offsets,
503  T* src,
504  T** dst,
505  int* d_device,
506  int* d_downstreamPatchIndex,
507  int* d_dstIndex
508 ) {
509  using AccessType = int32_t;
510  static_assert(sizeof(T) % sizeof(AccessType) == 0, "Tuples must be divisible by accesstype");
511  constexpr int kThreadsPerTuple = sizeof(T) / sizeof(AccessType);
512  constexpr int kTuplesPerWarp = WARPSIZE / kThreadsPerTuple;
513 
514  const int numTuplesWarp = ((numTuples + kTuplesPerWarp - 1) / kTuplesPerWarp);
515  const int numWarps = (blockDim.x * gridDim.x) / WARPSIZE;
516  const int wid_grid = (threadIdx.x + blockDim.x * blockIdx.x) / WARPSIZE;
517  const int lid = threadIdx.x % WARPSIZE;
518  const int indexInTuple = lid % kThreadsPerTuple;
519  const int tupleWarpIndex = lid / kThreadsPerTuple;
520 
521  for (int i = wid_grid; i < numTuplesWarp; i += numWarps) {
522  const int tupleIndex = i * kTuplesPerWarp + tupleWarpIndex;
523  const int active = (lid < kThreadsPerTuple * kTuplesPerWarp);
524 
525  if (active && tupleIndex < numTuples) {
526  const int device = d_device[tupleIndex];
527  const int index = d_dstIndex[tupleIndex];
528  const int patchIndex = d_downstreamPatchIndex[tupleIndex];
529 
530  // Hopefully this is cached to save us from lots of P2P accesses
531  const int offset = peer_offsets[device][patchIndex];
532 
533  AccessType* g_src_int = (AccessType*) &(src[tupleIndex]);
534  AccessType* g_dst_int = (AccessType*) &(dst[device][offset + index]);
535  g_dst_int[indexInTuple] = g_src_int[indexInTuple];
536  }
537 #if defined(NAMD_HIP)
538  NAMD_WARP_SYNC(WARP_FULL_MASK);
539 #else
540  WARP_SYNC(WARP_FULL_MASK);
541 #endif
542  }
543 }
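// Worked example of the warp-to-tuple mapping above (hypothetical tuple size):
// for a 24-byte tuple and WARPSIZE = 32, kThreadsPerTuple = 24 / 4 = 6 and
// kTuplesPerWarp = 32 / 6 = 5, so lanes 0-29 of each warp copy five tuples
// (six 4-byte words each, one word per lane) while lanes 30-31 stay idle --
// the unused threads mentioned in the comment block above.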
544 
545 
546 /**
547  * \brief Moves tuples to their updated patches' scratch buffers
548  *
549  * This will call the following kernels for each tuple type:
550  * - performTupleMigrationKernel() \copybrief performTupleMigrationKernel()
551  *
552  */
553 void MigrationBondedCUDAKernel::performTupleMigration(
554  TupleCounts count,
555  cudaStream_t stream
556 ) {
557  constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
558  int numBlocks;
559 
560  #define CALL(fieldName, typeName) do { \
561  numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
562  if (numBlocks) { \
563  performTupleMigrationKernel<typeName><<<numBlocks, numThreads, 0, stream>>>( \
564  count.fieldName, \
565  d_peer_offsets.fieldName, \
566  dataSrc.fieldName, \
567  d_peer_data.fieldName, \
568  d_device.fieldName(), \
569  d_downstreamPatchIndex.fieldName(), \
570  d_dstIndex.fieldName() \
571  ); \
572  } \
573  } while (0);
574 
575  CALL(bond, CudaBondStage);
576  CALL(angle, CudaAngleStage);
577  CALL(dihedral, CudaDihedralStage);
578  CALL(improper, CudaDihedralStage);
579  CALL(modifiedExclusion, CudaExclusionStage);
580  CALL(exclusion, CudaExclusionStage);
581  CALL(crossterm, CudaCrosstermStage);
582 
583  #undef CALL
584 }
585 
586 
587 template <typename T_in, typename T_out>
588 __device__ __forceinline__
589 T_out convertTuple(
590  T_in& src,
591  const PatchRecord* patches,
592  const Lattice lattice,
593  const double3* __restrict__ patchMapCenter,
594  const float4* __restrict__ xyzq);
595 
596 
597 template <>
598 __device__ __forceinline__
599 CudaBond convertTuple(
600  CudaBondStage& src,
601  const PatchRecord* patches,
602  const Lattice lattice,
603  const double3* __restrict__ patchMapCenter,
604  const float4* __restrict__ xyzq
605 ) {
606  CudaBond dst;
607 
608  dst.itype = src.itype;
609  dst.scale = src.scale;
610  dst.fepBondedType = src.fepBondedType;
611 
612  dst.i = src.index[0];
613  dst.j = src.index[1];
614 
615  double3 center0 = patchMapCenter[src.patchIDs[0]];
616  double3 center1 = patchMapCenter[src.patchIDs[1]];
617  double3 diff = center0 - center1;
618 
619  double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
620  double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
621 
622  double3 shiftVec = lattice.wrap_delta_scaled(pos0, pos1);
623 
624  shiftVec = shiftVec + diff;
625 
626  dst.ioffsetXYZ = make_float3(shiftVec);
627 
628  return dst;
629 }
630 
631 template <>
632 __device__ __forceinline__
633 CudaAngle convertTuple(
634  CudaAngleStage& src,
635  const PatchRecord* patches,
636  const Lattice lattice,
637  const double3* __restrict__ patchMapCenter,
638  const float4* __restrict__ xyzq) {
639 
640  CudaAngle dst;
641 
642  dst.itype = src.itype;
643  dst.scale = src.scale;
644  dst.fepBondedType = src.fepBondedType;
645 
646  dst.i = src.index[0];
647  dst.j = src.index[1];
648  dst.k = src.index[2];
649 
650  double3 center0 = patchMapCenter[src.patchIDs[0]];
651  double3 center1 = patchMapCenter[src.patchIDs[1]];
652  double3 center2 = patchMapCenter[src.patchIDs[2]];
653 
654  double3 diff01 = center0 - center1;
655  double3 diff21 = center2 - center1;
656 
657  double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
658  double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
659  double3 pos2 = make_double3(xyzq[dst.k]) + (double3) lattice.unscale(center2);
660 
661  double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
662  double3 shiftVec21 = lattice.wrap_delta_scaled(pos2, pos1);
663  shiftVec01 = shiftVec01 + diff01;
664  shiftVec21 = shiftVec21 + diff21;
665 
666  dst.ioffsetXYZ = make_float3(shiftVec01);
667  dst.koffsetXYZ = make_float3(shiftVec21);
668 
669  return dst;
670 }
671 
672 template <>
673 __device__ __forceinline__
674 CudaDihedral convertTuple(
675  CudaDihedralStage& src,
676  const PatchRecord* patches,
677  const Lattice lattice,
678  const double3* __restrict__ patchMapCenter,
679  const float4* __restrict__ xyzq) {
680 
681  CudaDihedral dst;
682 
683  dst.itype = src.itype;
684  dst.scale = src.scale;
685  dst.fepBondedType = src.fepBondedType;
686 
687  dst.i = src.index[0];
688  dst.j = src.index[1];
689  dst.k = src.index[2];
690  dst.l = src.index[3];
691 
692  double3 center0 = patchMapCenter[src.patchIDs[0]];
693  double3 center1 = patchMapCenter[src.patchIDs[1]];
694  double3 center2 = patchMapCenter[src.patchIDs[2]];
695  double3 center3 = patchMapCenter[src.patchIDs[3]];
696 
697  double3 diff01 = center0 - center1;
698  double3 diff12 = center1 - center2;
699  double3 diff32 = center3 - center2;
700 
701  double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
702  double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
703  double3 pos2 = make_double3(xyzq[dst.k]) + (double3) lattice.unscale(center2);
704  double3 pos3 = make_double3(xyzq[dst.l]) + (double3) lattice.unscale(center3);
705 
706  double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
707  double3 shiftVec12 = lattice.wrap_delta_scaled(pos1, pos2);
708  double3 shiftVec32 = lattice.wrap_delta_scaled(pos3, pos2);
709  shiftVec01 = shiftVec01 + diff01;
710  shiftVec12 = shiftVec12 + diff12;
711  shiftVec32 = shiftVec32 + diff32;
712 
713  dst.ioffsetXYZ = make_float3(shiftVec01);
714  dst.joffsetXYZ = make_float3(shiftVec12);
715  dst.loffsetXYZ = make_float3(shiftVec32);
716 
717  return dst;
718 }
719 
720 template <>
721 __device__ __forceinline__
722 CudaExclusion convertTuple(
723  CudaExclusionStage& src,
724  const PatchRecord* patches,
725  const Lattice lattice,
726  const double3* __restrict__ patchMapCenter,
727  const float4* __restrict__ xyzq) {
728 
729  CudaExclusion dst;
730 
731  dst.vdwtypei = src.vdwtypei;
732  dst.vdwtypej = src.vdwtypej;
733  dst.pswitch = src.pswitch;
734 
735  dst.i = src.index[0];
736  dst.j = src.index[1];
737 
738  double3 center0 = patchMapCenter[src.patchIDs[0]];
739  double3 center1 = patchMapCenter[src.patchIDs[1]];
740 
741  double3 diff = center0 - center1;
742 
743  double3 pos0 = make_double3(xyzq[dst.i]) + (double3) lattice.unscale(center0);
744  double3 pos1 = make_double3(xyzq[dst.j]) + (double3) lattice.unscale(center1);
745 
746  double3 shiftVec = lattice.wrap_delta_scaled(pos0, pos1);
747  shiftVec = shiftVec + diff;
748 
749  dst.ioffsetXYZ = make_float3(shiftVec);
750 
751  return dst;
752 }
753 
754 template <>
755 __device__ __forceinline__
756 CudaCrossterm convertTuple(
757  CudaCrosstermStage& src,
758  const PatchRecord* patches,
759  const Lattice lattice,
760  const double3* __restrict__ patchMapCenter,
761  const float4* __restrict__ xyzq) {
762 
763  CudaCrossterm dst;
764 
765  dst.itype = src.itype;
766  dst.scale = src.scale;
767  dst.fepBondedType = src.fepBondedType;
768 
769  dst.i1 = src.index[0];
770  dst.i2 = src.index[1];
771  dst.i3 = src.index[2];
772  dst.i4 = src.index[3];
773  dst.i5 = src.index[4];
774  dst.i6 = src.index[5];
775  dst.i7 = src.index[6];
776  dst.i8 = src.index[7];
777 
778  double3 center0 = patchMapCenter[src.patchIDs[0]];
779  double3 center1 = patchMapCenter[src.patchIDs[1]];
780  double3 center2 = patchMapCenter[src.patchIDs[2]];
781  double3 center3 = patchMapCenter[src.patchIDs[3]];
782  double3 center4 = patchMapCenter[src.patchIDs[4]];
783  double3 center5 = patchMapCenter[src.patchIDs[5]];
784  double3 center6 = patchMapCenter[src.patchIDs[6]];
785  double3 center7 = patchMapCenter[src.patchIDs[7]];
786 
787  double3 diff01 = center0 - center1;
788  double3 diff12 = center1 - center2;
789  double3 diff23 = center2 - center3;
790  double3 diff45 = center4 - center5;
791  double3 diff56 = center5 - center6;
792  double3 diff67 = center6 - center7;
793 
794  double3 pos0 = make_double3(xyzq[dst.i1]) + (double3) lattice.unscale(center0);
795  double3 pos1 = make_double3(xyzq[dst.i2]) + (double3) lattice.unscale(center1);
796  double3 pos2 = make_double3(xyzq[dst.i3]) + (double3) lattice.unscale(center2);
797  double3 pos3 = make_double3(xyzq[dst.i4]) + (double3) lattice.unscale(center3);
798  double3 pos4 = make_double3(xyzq[dst.i5]) + (double3) lattice.unscale(center4);
799  double3 pos5 = make_double3(xyzq[dst.i6]) + (double3) lattice.unscale(center5);
800  double3 pos6 = make_double3(xyzq[dst.i7]) + (double3) lattice.unscale(center6);
801  double3 pos7 = make_double3(xyzq[dst.i8]) + (double3) lattice.unscale(center7);
802 
803  double3 shiftVec01 = lattice.wrap_delta_scaled(pos0, pos1);
804  double3 shiftVec12 = lattice.wrap_delta_scaled(pos1, pos2);
805  double3 shiftVec23 = lattice.wrap_delta_scaled(pos2, pos3);
806  double3 shiftVec45 = lattice.wrap_delta_scaled(pos4, pos5);
807  double3 shiftVec56 = lattice.wrap_delta_scaled(pos5, pos6);
808  double3 shiftVec67 = lattice.wrap_delta_scaled(pos6, pos7);
809 
810  shiftVec01 = shiftVec01 + diff01;
811  shiftVec12 = shiftVec12 + diff12;
812  shiftVec23 = shiftVec23 + diff23;
813  shiftVec45 = shiftVec45 + diff45;
814  shiftVec56 = shiftVec56 + diff56;
815  shiftVec67 = shiftVec67 + diff67;
816 
817  dst.offset12XYZ = make_float3(shiftVec01);
818  dst.offset23XYZ = make_float3(shiftVec12);
819  dst.offset34XYZ = make_float3(shiftVec23);
820  dst.offset56XYZ = make_float3(shiftVec45);
821  dst.offset67XYZ = make_float3(shiftVec56);
822  dst.offset78XYZ = make_float3(shiftVec67);
823 
824  return dst;
825 }
826 
827 
828 
829 template<typename T>
830 __device__ __forceinline__
831 void updateStageData(
832  T& tuple,
833  const PatchRecord* patches,
834  const int* ids
835 ) {
836  for (int i = 0; i < T::size; i++) {
837  // The patchID and ID are correct
838  const int patchID = tuple.patchIDs[i];
839 
840  PatchRecord record = patches[patchID]; // This structure is indexed by global patch ID
841  // And it contains the LOCAL offset...
842  const int offset = record.atomStart;
843  tuple.index[i] += offset;
844  }
845 }
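// Worked example (illustrative numbers): if an atom is the 7th atom of its
// patch (tuple.index[i] == 7) and that patch's block of atoms starts at
// record.atomStart == 512 in the device-wide packed arrays, the loop above
// rewrites tuple.index[i] to 512 + 7 = 519.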
846 
847 /**
848  * \brief Updates the tuple stage data and constructs the real tuple data
849  *
850  * Similar to the other kernels, this one will bulk read tuple data to coalesce
851  * memory access.
852  *
853  * The stage update is done by the updateStageData function, which computes the
854  * new indexInDevice indices of each atom based on the atom's index within its patch,
855  * which is currently stored in the `index` field, and that patch's buffer offset
856  * in the device's packed buffers.
857  *
858  * The convertTuple function converts the stage tuple to a real tuple.
859  *
860  */
861 template <typename T_in, typename T_out>
862 __global__ void updateTuplesKernel(
863  const int numTuples,
864  T_in* stages_src,
865  T_in* stages_dst,
866  T_out* dst,
867  const int* ids,
868  const PatchRecord* patches,
869  const double3* __restrict__ patchMapCenter,
870  const float4* __restrict__ xyzq,
871  const Lattice lattice
872 ) {
873  using AccessType = int32_t;
874 
875  __shared__ T_in s_in[MigrationBondedCUDAKernel::kNumThreads];
876  __shared__ T_out s_out[MigrationBondedCUDAKernel::kNumThreads];
877 
878  AccessType* s_in_int = (AccessType*) &(s_in);
879  AccessType* s_out_int = (AccessType*) &(s_out);
880 
881  const int numTuplesBlock = ((numTuples + blockDim.x - 1) / blockDim.x) * blockDim.x;
882 
883  for (int i = blockDim.x * blockIdx.x; i < numTuplesBlock; i += blockDim.x * gridDim.x) {
884  // Load in tuples to shared memory
885  AccessType* g_in_int = (AccessType*) (stages_src + i);
886  const int numBytesIn = sizeof(T_in) * blockDim.x;
887  for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
888  const int tupleIdx = i + (j / (sizeof(T_in) / sizeof(AccessType)));
889  if (tupleIdx < numTuples) {
890  s_in_int[j] = g_in_int[j];
891  }
892  }
893  __syncthreads();
894 
895  if (i + threadIdx.x < numTuples) {
896  T_in src = s_in[threadIdx.x];
897  updateStageData(src, patches, ids);
898  T_out r_out = convertTuple<T_in, T_out>(src, patches, lattice, patchMapCenter, xyzq);
899  s_out[threadIdx.x] = r_out;
900  s_in[threadIdx.x] = src;
901  }
902  __syncthreads();
903 
904  // Write tuples out
905  AccessType* g_out_int = (AccessType*) (stages_dst + i);
906  for (int j = threadIdx.x; j < numBytesIn / sizeof(AccessType); j += blockDim.x) {
907  const int tupleIdx = i + (j / (sizeof(T_in) / sizeof(AccessType)));
908  if (tupleIdx < numTuples) {
909  g_out_int[j] = s_in_int[j];
910  }
911  }
912 
913  AccessType* g_outData_int = (AccessType*) (dst + i);
914  const int numBytesOut = sizeof(T_out) * blockDim.x;
915  for (int j = threadIdx.x; j < numBytesOut / sizeof(AccessType); j += blockDim.x) {
916  const int tupleIdx = i + (j / (sizeof(T_out) / sizeof(AccessType)));
917  if (tupleIdx < numTuples) {
918  g_outData_int[j] = s_out_int[j];
919  }
920  }
921  __syncthreads();
922  }
923 }
924 
925 /**
926  * \brief Updates the tuples once they have been migrated
927  *
928  *
929  * This will call the following kernels for each tuple type:
930  * - updateTuplesKernel() \copybrief updateTuplesKernel()
931  *
932  */
933 void MigrationBondedCUDAKernel::updateTuples(
934  TupleCounts count,
935  TupleData data,
936  const int* ids,
937  const PatchRecord* patches,
938  const double3* d_patchMapCenter,
939  const float4* xyzq,
940  const Lattice lattice,
941  cudaStream_t stream
942 ) {
943  constexpr int numThreads = MigrationBondedCUDAKernel::kNumThreads;
944  int numBlocks;
945 
946  #define CALL(fieldName, typeNameIn, typeNameOut) do { \
947  numBlocks = (count.fieldName + numThreads - 1) / numThreads; \
948  if (numBlocks) { \
949  updateTuplesKernel<typeNameIn, typeNameOut><<<numBlocks, numThreads, 0, stream>>>( \
950  count.fieldName, \
951  dataDst.fieldName, \
952  dataSrc.fieldName, \
953  data.fieldName, \
954  ids, \
955  patches, \
956  d_patchMapCenter, \
957  xyzq, \
958  lattice \
959  ); \
960  } \
961  } while (0);
962 
963  CALL(bond, CudaBondStage, CudaBond);
964  CALL(angle, CudaAngleStage, CudaAngle);
965  CALL(dihedral, CudaDihedralStage, CudaDihedral);
966  CALL(improper, CudaDihedralStage, CudaDihedral);
967  CALL(modifiedExclusion, CudaExclusionStage, CudaExclusion);
968  CALL(exclusion, CudaExclusionStage, CudaExclusion);
969  CALL(crossterm, CudaCrosstermStage, CudaCrossterm);
970 
971  #undef CALL
972 }
973 
974 
975 void MigrationBondedCUDAKernel::copyTupleToDevice(
976  TupleCounts count,
977  const int numPatchesHome,
978  TupleDataStage h_dataStage,
979  TupleIntArrays h_counts,
980  TupleIntArrays h_offsets,
981  cudaStream_t stream
982 ) {
983  #define CALL(fieldName, typeName) do { \
984  copy_HtoD<typeName>(h_dataStage.fieldName, dataDst.fieldName, count.fieldName, stream); \
985  copy_HtoD<int>(h_counts.fieldName, d_counts.fieldName(), numPatchesHome, stream); \
986  copy_HtoD<int>(h_offsets.fieldName, d_offsets.fieldName(), numPatchesHome+1, stream); \
987  } while (0);
988 
989  CALL(bond, CudaBondStage);
990  CALL(angle, CudaAngleStage);
991  CALL(dihedral, CudaDihedralStage);
992  CALL(improper, CudaDihedralStage);
993  CALL(modifiedExclusion, CudaExclusionStage);
994  CALL(exclusion, CudaExclusionStage);
995  CALL(crossterm, CudaCrosstermStage);
996 
997  #undef CALL
998 }
999 
1000 
1001 bool MigrationBondedCUDAKernel::reallocateBufferSrc(TupleCounts counts) {
1002  bool out = false;
1003 
1004  constexpr float kTupleOveralloc = ComputeBondedCUDAKernel::kTupleOveralloc;
1005 
1006  #define CALL(fieldName, typeName) do { \
1007  const size_t paddedCount = ComputeBondedCUDAKernel::warpAlign(counts.fieldName); \
1008  if (paddedCount) { \
1009  out |= reallocate_device<typeName>(&(dataSrc.fieldName), &(srcAlloc.fieldName), \
1010  paddedCount, kTupleOveralloc); \
1011  } \
1012  } while (0);
1013 
1014  CALL(bond, CudaBondStage);
1015  CALL(angle, CudaAngleStage);
1016  CALL(dihedral, CudaDihedralStage);
1017  CALL(improper, CudaDihedralStage);
1018  CALL(modifiedExclusion, CudaExclusionStage);
1019  CALL(exclusion, CudaExclusionStage);
1020  CALL(crossterm, CudaCrosstermStage);
1021 
1022  #undef CALL
1023 
1024  const size_t totalSize = ComputeBondedCUDAKernel::warpAlign(counts.bond)
1025  + ComputeBondedCUDAKernel::warpAlign(counts.angle)
1026  + ComputeBondedCUDAKernel::warpAlign(counts.dihedral)
1027  + ComputeBondedCUDAKernel::warpAlign(counts.improper)
1028  + ComputeBondedCUDAKernel::warpAlign(counts.modifiedExclusion)
1029  + ComputeBondedCUDAKernel::warpAlign(counts.exclusion)
1030  + ComputeBondedCUDAKernel::warpAlign(counts.crossterm);
1031 
1032  size_t temp = srcTotalAlloc;
1033  reallocate_device<int>(&(d_device.data), &temp, totalSize, kTupleOveralloc);
1034  temp = srcTotalAlloc;
1035  reallocate_device<int>(&(d_downstreamPatchIndex.data), &temp, totalSize, kTupleOveralloc);
1036  temp = srcTotalAlloc;
1037  reallocate_device<int>(&(d_dstIndex.data), &temp, totalSize, kTupleOveralloc);
1038  srcTotalAlloc = temp;
1039 
1040  size_t offset = 0;
1041 
1042  #define CALL(fieldName, typeName, typeIndex) do { \
1043  d_device.offsets[typeIndex] = offset; \
1044  d_dstIndex.offsets[typeIndex] = offset; \
1045  d_downstreamPatchIndex.offsets[typeIndex] = offset; \
1046  offset += (ComputeBondedCUDAKernel::warpAlign(counts.fieldName)); \
1047  } while (0);
1048 
1049  CALL(bond, CudaBondStage, 0);
1050  CALL(angle, CudaAngleStage, 1);
1051  CALL(dihedral, CudaDihedralStage, 2);
1052  CALL(improper, CudaDihedralStage, 3);
1053  CALL(modifiedExclusion, CudaExclusionStage, 4);
1054  CALL(exclusion, CudaExclusionStage, 5);
1055  CALL(crossterm, CudaCrosstermStage, 6);
1056 
1057  #undef CALL
1058 
1059  return out;
1060 }
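// Worked example of the packed layout computed above (illustrative, warp-aligned
// counts): with bond = 64, angle = 128, and dihedral = 32, the bond section of
// d_device / d_downstreamPatchIndex / d_dstIndex starts at offset 0, the angle
// section at 64, the dihedral section at 192, and so on, so all seven tuple
// types share a single contiguous int buffer per array.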
1061 
1062 
1063 bool MigrationBondedCUDAKernel::reallocateBufferDst(TupleCounts counts) {
1064  constexpr float kTupleOveralloc = ComputeBondedCUDAKernel::kTupleOveralloc;
1065  bool out = false;
1066 
1067  #define CALL(fieldName, typeName) do { \
1068  const size_t paddedCount = ComputeBondedCUDAKernel::warpAlign(counts.fieldName); \
1069  if (paddedCount) { \
1070  out |= reallocate_device<typeName>(&(dataDst.fieldName), &(dstAlloc.fieldName), \
1071  paddedCount, kTupleOveralloc); \
1072  } \
1073  } while (0);
1074 
1075  CALL(bond, CudaBondStage);
1076  CALL(angle, CudaAngleStage);
1077  CALL(dihedral, CudaDihedralStage);
1078  CALL(improper, CudaDihedralStage);
1079  CALL(modifiedExclusion, CudaExclusionStage);
1080  CALL(exclusion, CudaExclusionStage);
1081  CALL(crossterm, CudaCrosstermStage);
1082 
1083  #undef CALL
1084 
1085  return out;
1086 }
1087 
1088 
1089 MigrationBondedCUDAKernel::MigrationBondedCUDAKernel() {
1090  #define CALL(fieldName) do { \
1091  srcAlloc.fieldName = 0; \
1092  dstAlloc.fieldName = 0; \
1093  dataSrc.fieldName = NULL; \
1094  dataDst.fieldName = NULL; \
1095  d_peer_data.fieldName = NULL; \
1096  d_peer_counts.fieldName = NULL; \
1097  d_peer_offsets.fieldName = NULL; \
1098  } while (0);
1099 
1100  CALL(bond);
1101  CALL(angle);
1102  CALL(dihedral);
1103  CALL(improper);
1104  CALL(modifiedExclusion);
1105  CALL(exclusion);
1106  CALL(crossterm);
1107 
1108  #undef CALL
1109 
1110  srcTotalAlloc = 0;
1111  d_device.data = NULL;
1112  d_downstreamPatchIndex.data = NULL;
1113  d_dstIndex.data = NULL;
1114 
1115  d_counts.data = NULL;
1116  d_offsets.data = NULL;
1117 
1118  patchDeviceScan_alloc = 0;
1119  d_patchDeviceScan_scratch = NULL;
1120 
1121  allocate_device<TupleCounts>(&d_totalCount, 1);
1122  allocate_host<TupleCounts>(&h_totalCount, 1);
1123 }
1124 
1125 MigrationBondedCUDAKernel::~MigrationBondedCUDAKernel() {
1126  #define CALL(fieldName, typeName) do { \
1127  if (dataSrc.fieldName != NULL) deallocate_device<typeName>(&(dataSrc.fieldName)); \
1128  if (dataDst.fieldName != NULL) deallocate_device<typeName>(&(dataDst.fieldName)); \
1129  if (d_peer_data.fieldName != NULL) deallocate_device<typeName*>(&(d_peer_data.fieldName)); \
1130  if (d_peer_counts.fieldName != NULL) deallocate_device<int*>(&(d_peer_counts.fieldName)); \
1131  if (d_peer_offsets.fieldName != NULL) deallocate_device<int*>(&(d_peer_offsets.fieldName)); \
1132  } while (0);
1133 
1134  CALL(bond, CudaBondStage);
1135  CALL(angle, CudaAngleStage);
1136  CALL(dihedral, CudaDihedralStage);
1137  CALL(improper, CudaDihedralStage);
1138  CALL(modifiedExclusion, CudaExclusionStage);
1139  CALL(exclusion, CudaExclusionStage);
1140  CALL(crossterm, CudaCrosstermStage);
1141 
1142  #undef CALL
1143 
1144  if (d_counts.data != NULL) deallocate_device<int>(&d_counts.data);
1145  if (d_offsets.data != NULL) deallocate_device<int>(&d_offsets.data);
1146  if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
1147 
1148  if (d_device.data != NULL) deallocate_device<int>(&d_device.data);
1149  if (d_downstreamPatchIndex.data != NULL) deallocate_device<int>(&d_downstreamPatchIndex.data);
1150  if (d_dstIndex.data != NULL) deallocate_device<int>(&d_dstIndex.data);
1151 
1152  deallocate_device<TupleCounts>(&d_totalCount);
1153  deallocate_host<TupleCounts>(&h_totalCount);
1154 }
1155 
1156 void MigrationBondedCUDAKernel::setup(const int numDevices, const int numPatchesHome) {
1157  //
1158  // Allocate storage for the per-patch counters. We allocate numPatchesHome + 1
1159  // entries per tuple type so the offsets arrays have room for the trailing total.
1160  //
1161  numPatchesHomePad = ((numPatchesHome + 1 + kPatchNumberPad - 1)
1162  / kPatchNumberPad) * kPatchNumberPad;
1163 #if defined(NAMD_CUDA)
1164  allocate_device<int>(&(d_counts.data), numPatchesHomePad * kNumTuplesTypes);
1165  allocate_device<int>(&(d_offsets.data), numPatchesHomePad * kNumTuplesTypes);
1166 #else
1167  hipExtMallocWithFlags((void**) &(d_counts.data), sizeof(int)*numPatchesHomePad * kNumTuplesTypes, hipDeviceMallocFinegrained);
1168  hipExtMallocWithFlags((void**) &(d_offsets.data), sizeof(int)*numPatchesHomePad * kNumTuplesTypes, hipDeviceMallocFinegrained);
1169 #endif
1170  cudaMemset(d_offsets.data, 0, numPatchesHomePad * kNumTuplesTypes * sizeof(int));
1171 
1172 #if defined(NAMD_CUDA)
1173  #define CALL(fieldName, typeName, offset) do { \
1174  allocate_device<typeName*>(&(d_peer_data.fieldName), numDevices); \
1175  allocate_device<int*>(&(d_peer_counts.fieldName), numDevices); \
1176  allocate_device<int*>(&(d_peer_offsets.fieldName), numDevices); \
1177  d_counts.offsets[offset] = offset * numPatchesHomePad; \
1178  d_offsets.offsets[offset] = offset * numPatchesHomePad; \
1179  } while (0);
1180 
1181 #else
1182  #define CALL(fieldName, typeName, offset) do { \
1183  allocate_device<typeName*>(&(d_peer_data.fieldName), numDevices); \
1184  hipExtMallocWithFlags((void**) &(d_peer_counts.fieldName), sizeof(int*)*numDevices, hipDeviceMallocFinegrained); \
1185  allocate_device<int*>(&(d_peer_offsets.fieldName), numDevices); \
1186  d_counts.offsets[offset] = offset * numPatchesHomePad; \
1187  d_offsets.offsets[offset] = offset * numPatchesHomePad; \
1188  } while (0);
1189 
1190 #endif
1191 
1192  CALL(bond, CudaBondStage, 0);
1193  CALL(angle, CudaAngleStage, 1);
1194  CALL(dihedral, CudaDihedralStage, 2);
1195  CALL(improper, CudaDihedralStage, 3);
1196  CALL(modifiedExclusion, CudaExclusionStage, 4);
1197  CALL(exclusion, CudaExclusionStage, 5);
1198  CALL(crossterm, CudaCrosstermStage, 6);
1199 
1200  #undef CALL
1201 
1202 
1203  //
1204  // Allocate space for CUB Device functions
1205  //
1206  size_t temp_storage_bytes = 0;
1207  int* temp_ptr = NULL;
1208  cub::DeviceScan::InclusiveSum(
1209  NULL, temp_storage_bytes,
1210  temp_ptr, temp_ptr, numPatchesHome
1211  );
1212  patchDeviceScan_alloc = temp_storage_bytes;
1213  allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
1214 }
1215 
1216 void MigrationBondedCUDAKernel::clearTupleCounts(const int numPatchesHome, cudaStream_t stream) {
1217  cudaMemsetAsync(d_counts.data, 0, numPatchesHomePad * kNumTuplesTypes * sizeof(int), stream);
1218 }
1219 
1220 TupleCounts MigrationBondedCUDAKernel::fetchTupleCounts(const int numPatchesHome, cudaStream_t stream) {
1221 
1222  copy_DtoH<TupleCounts>(d_totalCount, h_totalCount, 1, stream);
1223 
1224  cudaCheck(cudaStreamSynchronize(stream));
1225  TupleCounts newLocal;
1226 
1227  #define CALL(fieldName, typeName) do { \
1228  newLocal.fieldName = h_totalCount[0].fieldName; \
1229  } while (0);
1230 
1231  CALL(bond, CudaBondStage);
1232  CALL(angle, CudaAngleStage);
1233  CALL(dihedral, CudaDihedralStage);
1234  CALL(improper, CudaDihedralStage);
1235  CALL(modifiedExclusion, CudaExclusionStage);
1236  CALL(exclusion, CudaExclusionStage);
1237  CALL(crossterm, CudaCrosstermStage);
1238 
1239  #undef CALL
1240 
1241  return newLocal;
1242 }
1243 
1244 void MigrationBondedCUDAKernel::copyPeerDataToDevice(
1245  TupleDataStagePeer h_peer_data,
1246  TupleIntArraysPeer h_peer_counts,
1247  TupleIntArraysPeer h_peer_offsets,
1248  const int numDevices,
1249  cudaStream_t stream
1250 ) {
1251  #define CALL(fieldName, typeName) do { \
1252  copy_HtoD<typeName*>(h_peer_data.fieldName, d_peer_data.fieldName, numDevices, stream); \
1253  copy_HtoD<int*>(h_peer_counts.fieldName, d_peer_counts.fieldName, numDevices, stream); \
1254  copy_HtoD<int*>(h_peer_offsets.fieldName, d_peer_offsets.fieldName, numDevices, stream); \
1255  } while (0);
1256 
1257  CALL(bond, CudaBondStage);
1258  CALL(angle, CudaAngleStage);
1259  CALL(dihedral, CudaDihedralStage);
1260  CALL(improper, CudaDihedralStage);
1261  CALL(modifiedExclusion, CudaExclusionStage);
1262  CALL(exclusion, CudaExclusionStage);
1263  CALL(crossterm, CudaCrosstermStage);
1264 
1265  #undef CALL
1266 }
1267 
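// Hedged sketch of how the entry points in this file might be sequenced during
// one tuple-migration step.  The real driver lives in the host compute object,
// not here, and it interleaves multi-GPU synchronization, peer-buffer setup
// (copyPeerDataToDevice), and buffer reallocation between these calls, so treat
// this only as a reading aid; every argument name is a placeholder.
inline void example_migration_step_sketch(
  MigrationBondedCUDAKernel& kernel,
  const int myDeviceIndex, const int numPatchesHome,
  TupleCounts counts, TupleData data,
  const int4* migrationDestination,
  const int* patchIDtoGPU, const int* patchIDtoHomePatchIndex,
  const int* ids, const PatchRecord* patches,
  const double3* d_patchMapCenter, const float4* xyzq,
  const Lattice lattice,
  const int aDim, const int bDim,
  const int cMaxIndex, const int bMaxIndex, const int aMaxIndex,
  cudaStream_t stream
) {
  kernel.clearTupleCounts(numPatchesHome, stream);
  kernel.computeTupleDestination(myDeviceIndex, counts, numPatchesHome,
    migrationDestination, patchIDtoGPU, patchIDtoHomePatchIndex,
    aDim, bDim, cMaxIndex, bMaxIndex, aMaxIndex, stream);
  kernel.reserveTupleDestination(myDeviceIndex, numPatchesHome, stream);
  kernel.computePatchOffsets(numPatchesHome, stream);
  kernel.performTupleMigration(counts, stream);
  const TupleCounts newCounts = kernel.fetchTupleCounts(numPatchesHome, stream);
  kernel.updateTuples(newCounts, data, ids, patches, d_patchMapCenter,
    xyzq, lattice, stream);
}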
1268 #endif