2 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <namd_cub/cub.cuh>
8 #include <hip/hip_runtime.h>
9 #include <hipcub/hipcub.hpp>
13 #include "HipDefines.h"
15 #include "MigrationCUDAKernel.h"
16 #include "CudaComputeNonbondedKernel.h"
17 #include "CudaComputeNonbondedKernel.hip.h"
19 #ifdef NODEGROUP_FORCE_REGISTER
21 #define MAX_VALUE 2147483647
22 #define BIG_FLOAT 1e20
23 #define SMALL_FLOAT -1e20
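/**
 * \brief Performs one bisection step used to build the spatial sort key
 *
 * Inferred from the code below: each call sorts the atoms of a patch along one coordinate with
 * cub::BlockRadixSort and records in thread_out which side of the split each atom falls on by
 * setting bit (30 - bit_pos). The split point is roughly the midpoint of the atom's current
 * [start, end) range, rounded to a multiple of split_factor, presumably to keep splits
 * warp-aligned. Because earlier decisions occupy the high bits of the sort key, each sort only
 * reorders atoms within their current partition. On the last step (bit_pos == bit_total - 1)
 * the 18-bit normalized coordinate is ORed into the low bits as a tie-breaker, so the finished
 * key holds the bisection decisions in its high bits and the in-slab position in its low bits.
 */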
25 __device__ __forceinline__ void singleBisect(
30 const double* current_dim,
32 unsigned int* s_indexBuffer,
33 unsigned int (&thread_start)[MigrationCUDAKernel::kValuesPerThread],
34 unsigned int (&thread_end)[MigrationCUDAKernel::kValuesPerThread],
35 unsigned int (&thread_out)[MigrationCUDAKernel::kValuesPerThread],
36 unsigned int (&thread_keys)[MigrationCUDAKernel::kValuesPerThread],
37 unsigned int (&thread_values)[MigrationCUDAKernel::kValuesPerThread]
39 constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
40 typedef cub::BlockRadixSort<unsigned int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, unsigned int> BlockRadixSort;
41 __shared__ typename BlockRadixSort::TempStorage temp_sort;
44 for (int i = 0; i < kValuesPerThread; i++) {
45 const int idx = kValuesPerThread * threadIdx.x + i;
47 const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
48 thread_keys[i] = thread_out[i] | pos_norm;
49 thread_values[i] = idx;
52 thread_values[i] = ~0;
57 BlockRadixSort(temp_sort).Sort(thread_keys, thread_values);
60 for (int i = 0; i < kValuesPerThread; i++) {
61 const int idx = kValuesPerThread * threadIdx.x + i;
62 if (thread_values[i] < numAtoms) {
63 s_indexBuffer[thread_values[i]] = idx;
69 for (int i = 0; i < kValuesPerThread; i++) {
70 const int idx = kValuesPerThread * threadIdx.x + i;
72 const int newIndex = s_indexBuffer[idx];
74 int split_factor = 32;
77 if ( thread_start[i]/split_factor < (thread_end[i]-1)/split_factor ) {
78 split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
81 if (split_factor == 1){
82 split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
86 if (newIndex >= split) {
87 thread_out[i] |= 1 << (30 - bit_pos);
88 thread_start[i] = split;
90 thread_end[i] = split;
92 if (bit_pos == bit_total - 1) {
93 const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
94 thread_out[i] |= pos_norm;
102 __global__ void computeBisect(
103 const int numPatches,
105 const CudaLocalRecord* localRecords,
106 const double3* patchMin,
107 const double3* patchMax,
114 constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
116 __shared__ CudaLocalRecord s_record;
117 using AccessType = int32_t;
118 AccessType* s_record_buffer = (AccessType*) &s_record;
120 typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
121 __shared__ typename BlockReduce::TempStorage temp_storage;
123 __shared__ double3 s_pmin;
124 __shared__ double3 s_pmax;
127 __shared__ unsigned int s_indexBuffer[MigrationCUDAKernel::kMaxAtomsPerPatch];
129 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
131 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
132 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
136 const int numAtoms = s_record.numAtoms;
137 const int offset = s_record.bufferOffset;
139 // Compute the thread-local min/max values
144 pmax.x = SMALL_FLOAT;
145 pmax.y = SMALL_FLOAT;
146 pmax.z = SMALL_FLOAT;
147 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
148 pmin.x = fmin(pmin.x, pos_x[offset + i]);
149 pmin.y = fmin(pmin.y, pos_y[offset + i]);
150 pmin.z = fmin(pmin.z, pos_z[offset + i]);
151 pmax.x = fmax(pmax.x, pos_x[offset + i]);
152 pmax.y = fmax(pmax.y, pos_y[offset + i]);
153 pmax.z = fmax(pmax.z, pos_z[offset + i]);
157 // Compute the thread-block min/max values
158 pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, cub::Min());
160 pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, cub::Min());
162 pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, cub::Min());
165 pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, cub::Max());
167 pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, cub::Max());
169 pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, cub::Max());
172 if (threadIdx.x == 0) {
182 unsigned int thread_start[kValuesPerThread];
183 unsigned int thread_end[kValuesPerThread];
184 unsigned int thread_out[kValuesPerThread];
185 unsigned int thread_keys[kValuesPerThread];
186 unsigned int thread_values[kValuesPerThread];
188 for (int i = 0; i < kValuesPerThread; i++) {
191 thread_end[i] = numAtoms;
194 double diff_x = pmax.x - pmin.x;
195 double diff_y = pmax.y - pmin.y;
196 double diff_z = pmax.z - pmin.z;
198 const int num_iters = 3;
199 for (int i = 0; i < num_iters; i++) {
200 if (diff_x > diff_y && diff_y > diff_z) {
201 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
202 thread_start, thread_end, thread_out, thread_keys, thread_values);
203 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
204 thread_start, thread_end, thread_out, thread_keys, thread_values);
205 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
206 thread_start, thread_end, thread_out, thread_keys, thread_values);
207 } else if (diff_x > diff_z && diff_z > diff_y) {
208 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
209 thread_start, thread_end, thread_out, thread_keys, thread_values);
210 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
211 thread_start, thread_end, thread_out, thread_keys, thread_values);
212 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
213 thread_start, thread_end, thread_out, thread_keys, thread_values);
214 } else if (diff_y > diff_x && diff_x > diff_z) {
215 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
216 thread_start, thread_end, thread_out, thread_keys, thread_values);
217 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
218 thread_start, thread_end, thread_out, thread_keys, thread_values);
219 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
220 thread_start, thread_end, thread_out, thread_keys, thread_values);
221 } else if (diff_y > diff_z && diff_z > diff_x) {
222 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
223 thread_start, thread_end, thread_out, thread_keys, thread_values);
224 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
225 thread_start, thread_end, thread_out, thread_keys, thread_values);
226 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
227 thread_start, thread_end, thread_out, thread_keys, thread_values);
228 } else if (diff_z > diff_x && diff_x > diff_y) {
229 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
230 thread_start, thread_end, thread_out, thread_keys, thread_values);
231 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
232 thread_start, thread_end, thread_out, thread_keys, thread_values);
233 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
234 thread_start, thread_end, thread_out, thread_keys, thread_values);
236 singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
237 thread_start, thread_end, thread_out, thread_keys, thread_values);
238 singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
239 thread_start, thread_end, thread_out, thread_keys, thread_values);
240 singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
241 thread_start, thread_end, thread_out, thread_keys, thread_values);
245 for (int i = 0; i < kValuesPerThread; i++) {
246 const int idx = kValuesPerThread * threadIdx.x + i;
247 if (idx < numAtoms) {
248 sortIndex[offset + idx] = thread_out[i];
249 sortOrder[offset + idx] = idx;
258 __device__ __forceinline__ int simple_grid(const int numGrids, const int x, const int y, const int z) {
259 const int index = x + numGrids * (y + numGrids * z);
263 __device__ __forceinline__ int snake_grid(const int numGrids, const int x, const int y, const int z) {
264 int index = numGrids * numGrids * x;
266 index += numGrids * y;
268 index += numGrids * (numGrids - 1 - y);
270 if (y % 2 == x % 2) {
273 index += (numGrids - 1 - z);
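/*
 * Assuming the other branch of the if above adds z unreversed when (y % 2 == x % 2), snake_grid
 * walks the numGrids^3 sub-cells in a boustrophedon ("snake") order so that consecutive indices
 * are spatially adjacent. For numGrids = 2 the visiting order of (x, y, z) would be:
 *   (0,0,0) (0,0,1) (0,1,1) (0,1,0) (1,1,0) (1,1,1) (1,0,1) (1,0,0)
 * i.e. each step changes a single coordinate by one, unlike simple_grid, which jumps back to
 * z = 0 at the end of every column.
 */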
 * \brief Computes the nonbonded index of atoms for optimal clustering
 *
 * Each thread block is assigned to a patch and does the following:
 * - First, the minimum and maximum coordinates of the patch are computed
 *   (patchMin and patchMax do not produce the same results; it is unclear whether
 *   this is because of migration coordinates or something with the lattice).
 * - Second, the nonbonded index of each atom is computed and stored.
 *   The exact spatial hashing algorithm still needs investigation.
291 __global__ void computeNonbondedIndex(
292 const int numPatches,
294 const CudaLocalRecord* localRecords,
295 const double3* patchMin,
296 const double3* patchMax,
303 __shared__ CudaLocalRecord s_record;
304 using AccessType = int32_t;
305 AccessType* s_record_buffer = (AccessType*) &s_record;
307 typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
308 __shared__ typename BlockReduce::TempStorage temp_storage;
310 __shared__ double3 s_pmin;
311 __shared__ double3 s_pmax;
313 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
315 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
316 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
320 const int numAtoms = s_record.numAtoms;
321 const int offset = s_record.bufferOffset;
323 // Compute the thread-local min/max values
328 pmax.x = SMALL_FLOAT;
329 pmax.y = SMALL_FLOAT;
330 pmax.z = SMALL_FLOAT;
331 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
332 pmin.x = fmin(pmin.x, pos_x[offset + i]);
333 pmin.y = fmin(pmin.y, pos_y[offset + i]);
334 pmin.z = fmin(pmin.z, pos_z[offset + i]);
335 pmax.x = fmax(pmax.x, pos_x[offset + i]);
336 pmax.y = fmax(pmax.y, pos_y[offset + i]);
337 pmax.z = fmax(pmax.z, pos_z[offset + i]);
341 // Compute the thread-block min/max values
342 pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, cub::Min());
344 pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, cub::Min());
346 pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, cub::Min());
349 pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, cub::Max());
351 pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, cub::Max());
353 pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, cub::Max());
356 if (threadIdx.x == 0) {
365 // Compute the sort index
366 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
367 const double x = pos_x[offset + i];
368 const double y = pos_y[offset + i];
369 const double z = pos_z[offset + i];
371 // Compute subpatch index
372 int idx_x = (int)((x - pmin.x) / (pmax.x - pmin.x) * ((double) numGrids));
373 int idx_y = (int)((y - pmin.y) / (pmax.y - pmin.y) * ((double) numGrids));
374 int idx_z = (int)((z - pmin.z) / (pmax.z - pmin.z) * ((double) numGrids));
375 idx_x = min(max(idx_x, 0), numGrids-1);
376 idx_y = min(max(idx_y, 0), numGrids-1);
377 idx_z = min(max(idx_z, 0), numGrids-1);
379 // Compute sort index
380 const int block_index = snake_grid(numGrids, idx_x, idx_y, idx_z);
381 const double z_norm = (z - pmin.z) / (pmax.z - pmin.z);
383 const int reverse = ((idx_y % 2) == (idx_x % 2));
386 inner_index = (unsigned int) (z_norm * (1 << 16));
388 inner_index = ~((unsigned int) (z_norm * (1 << 16)));
391 sortIndex[offset + i] = (block_index << 17) + inner_index;
392 sortOrder[offset + i] = i;
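      // The combined key keeps the snake-ordered sub-cell index in the high bits
      // (block_index << 17) and a 16-bit normalized z position in the low bits; the bitwise
      // complement used for "reverse" cells is intended to flip the within-cell z order so the
      // traversal keeps snaking. Note that the launch of this kernel in sortAtoms() is currently
      // commented out in favor of computeBisect.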
 * \brief Sorts atoms into the nonbonded ordering based on previously computed indices
 *
 * Uses CUB's block-level radix sort to generate the nonbonded ordering from the previously
 * computed spatial hashing. It generates both a forward and a backward mapping.
406 __global__ void sortAtomsKernel(
407 const int numPatches,
408 const CudaLocalRecord* localRecords,
413 __shared__ CudaLocalRecord s_record;
414 using AccessType = int32_t;
415 AccessType* s_record_buffer = (AccessType*) &s_record;
417 typedef cub::BlockRadixSort<int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, int> BlockRadixSort;
418 __shared__ typename BlockRadixSort::TempStorage temp_storage;
420 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
421 // Read in the CudaLocalRecord using multiple threads. This should
423 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
424 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
428 const int numAtoms = s_record.numAtoms;
429 const int offset = s_record.bufferOffset;
431 int thread_keys[MigrationCUDAKernel::kValuesPerThread];
432 int thread_values[MigrationCUDAKernel::kValuesPerThread];
433 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
434 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
435 if (idx < numAtoms) {
436 thread_keys[i] = sortIndex[offset + idx];
437 thread_values[i] = sortOrder[offset + idx];
439 thread_keys[i] = MAX_VALUE;
440 thread_values[i] = -1;
445 BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
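      // After the radix sort, thread_keys holds the spatial keys in ascending order and
      // thread_values holds the original atom indices, so the loop below produces
      // sortOrder[new] = old (a gather map) and unsortOrder[old] = new (its inverse),
      // letting buffers be permuted in either direction.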
448 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
449 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
450 if (thread_keys[i] != MAX_VALUE) {
451 unsortOrder[offset + thread_values[i]] = idx;
452 sortOrder[offset + idx] = thread_values[i];
453 sortIndex[offset + idx] = thread_keys[i];
461 __global__ void printSortOrder(
462 const int numPatches,
463 const CudaLocalRecord* localRecords,
469 __shared__ CudaLocalRecord s_record;
470 using AccessType = int32_t;
471 AccessType* s_record_buffer = (AccessType*) &s_record;
473 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
474 // Read in the CudaLocalRecord using multiple threads. This should
476 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
477 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
481 const int numAtoms = s_record.numAtoms;
482 const int offset = s_record.bufferOffset;
484 if (patchIndex == 0) {
485 if (threadIdx.x == 0) {
486 for (int i = 0; i < numAtoms; i ++) {
487 printf("%d %d %f %f %f\n", i, sortOrder[offset + i], pos_x[offset + i], pos_y[offset+i], pos_z[offset + i]);
495 void MigrationCUDAKernel::sortAtoms(
496 const int numPatches,
498 const CudaLocalRecord* records,
499 const double3* patchMin,
500 const double3* patchMax,
509 constexpr int numThreads = kSortNumThreads;
510 const int numBlocks = numPatches;
512 // Knob for the spatial hashing
513 const int numGrid = 4;
515 computeBisect<<<numBlocks, numThreads, 0, stream>>>(
516 // computeNonbondedIndex<<<numBlocks, numThreads, 0, stream>>>(
529 sortAtomsKernel<<<numBlocks, numThreads, 0, stream>>>(
538 printSortOrder<<<numBlocks, numThreads, 0, stream>>>(
 * \brief Computes derived quantities for SoA data
 *
 * Some quantities are not stored in the AoS data. This kernel computes values that are
 * used elsewhere in the calculation, such as the reciprocal mass.
558 __global__ void calculateDerivedKernel(
559 const int numPatches,
560 const CudaLocalRecord* localRecords,
564 __shared__ CudaLocalRecord s_record;
565 using AccessType = int32_t;
566 AccessType* s_record_buffer = (AccessType*) &s_record;
568 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
569 // Read in the CudaLocalRecord using multiple threads. This should
571 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
572 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
576 const int numAtoms = s_record.numAtoms;
577 const int offset = s_record.bufferOffset;
579 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
580 const float m = mass[offset + i];
      recipMass[offset + i] = (m > 0) ? 1.0f / m : 0.0f;
 * \brief Computes the Langevin derived quantities for SoA data
 *
 * Some quantities are not stored in the AoS data. This kernel computes values that are
 * used elsewhere in the Langevin calculation, such as langScalRandBBK2.
595 template<bool kDoAlch>
596 __global__ void calculateDerivedLangevinKernel(
597 const int numPatches,
598 const CudaLocalRecord* localRecords,
601 const double tempFactor,
604 float* langevinParam,
605 float* langScalVelBBK2,
606 float* langScalRandBBK2
608 __shared__ CudaLocalRecord s_record;
609 using AccessType = int32_t;
610 AccessType* s_record_buffer = (AccessType*) &s_record;
612 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
613 // Read in the CudaLocalRecord using multiple threads. This should
615 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
616 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
620 const int numAtoms = s_record.numAtoms;
621 const int offset = s_record.bufferOffset;
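    // The loop below evaluates the per-atom BBK Langevin scaling factors:
    //   langScalRandBBK2 = sqrt(2 * gamma * dt * kB*T * factor / m)
    //   langScalVelBBK2  = 1 / (1 + gamma * dt / 2)
    // where gamma * dt = dt * langevinParam, factor is the alchemical temperature factor
    // (tempFactor) when kDoAlch is set, and 1/m is taken from recipMass.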
623 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
624 const double dt_gamma = dt * langevinParam[offset + i];
625 const float invm = recipMass[offset + i];
626 const double factor = (kDoAlch && partition[offset + i]) ? tempFactor : 1.0;
627 langScalRandBBK2[offset + i] = (float) sqrt( 2.0 * dt_gamma * kbT * factor * invm);
628 langScalVelBBK2[offset + i] = (float) (1.0 / (1.0 + 0.5 * dt_gamma));
 * \brief Copies AoS data into SoA buffers
 *
 * During migration, atomic data is stored as AoS. After migration, we must copy
 * the data into SoA buffers for the rest of the calculation. Since the AoS atomic data
 * is 128 bytes per atom, we can do this with efficient memory accesses, using shared memory
 * as a staging buffer for the transpose.
 *
 * TODO: Remove the ugly if-statement chain used for writing the buffers
648 template<bool kDoAlch, bool kDoLangevin>
649 __global__ void copy_AoS_to_SoAKernel(
650 const int numPatches,
651 const CudaLocalRecord* localRecords,
652 const FullAtom* atomdata_AoS,
663 int* hydrogenGroupSize,
664 int* migrationGroupSize,
666 float* rigidBondLength,
671 constexpr int kAtomsPerBuffer = MigrationCUDAKernel::kAtomsPerBuffer;
672 constexpr int kNumSoABuffers = MigrationCUDAKernel::kNumSoABuffers;
674 __shared__ CudaLocalRecord s_record;
675 using AccessType = int32_t;
676 AccessType* s_record_buffer = (AccessType*) &s_record;
  // FullAtom contains "Vector"s which default-initialize to 99999; dynamic initialization is not
  // allowed for shared memory, so suppress the warning since the code never relies on it.
681 #pragma diag_suppress static_var_with_dynamic_init
682 __shared__ FullAtom s_atombuffer[kAtomsPerBuffer]; // For a 128 byte cache line
  // NVCC might allow the declaration above, but initialization of shared memory
  // is not allowed under clang.
686 extern __shared__ FullAtom s_atombuffer[];
689 const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
690 const int wid = threadIdx.x / WARPSIZE;
691 const int tid = threadIdx.x % WARPSIZE;
693 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
694 // Read in the CudaLocalRecord using multiple threads. This should
696 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
697 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
701 const int numAtoms = s_record.numAtoms;
702 const int numAtomsPad = (s_record.numAtoms + kAtomsPerBuffer - 1) / kAtomsPerBuffer;
703 const int offset = s_record.bufferOffset;
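    // The loop below transposes AoS -> SoA one tile of kAtomsPerBuffer atoms at a time:
    // first each warp copies one FullAtom (~128 bytes) into shared memory using consecutive
    // 4-byte loads, then each warp takes ownership of one SoA field ("buffer") and writes that
    // field for every atom in the tile, one atom per lane, so both phases are coalesced.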
705 for (int i = 0; i < numAtomsPad; i++) {
706 const int atomOffset = i * kAtomsPerBuffer;
707 // Load atoms. 1 warp per atom
708 for (int atom_idx = wid; atom_idx < kAtomsPerBuffer; atom_idx += warps_per_threadblock) {
709 if (atomOffset + atom_idx < numAtoms) {
710 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
711 int32_t *s_int = (int32_t*) &(s_atombuffer[atom_idx]);
712 int32_t *g_int = (int32_t*) &(atomdata_AoS[offset + atomOffset + atom_idx]);
713 s_int[tid] = g_int[tid];
719 // Store atoms in SoA. 1 thread per atom
720 for (int buffer = wid; buffer < kNumSoABuffers; buffer += warps_per_threadblock) {
721 if (atomOffset + tid < numAtoms) {
        if (buffer == 0)  vel_x             [offset + atomOffset + tid] = s_atombuffer[tid].velocity.x;
723 if (buffer == 1) vel_y [offset + atomOffset + tid] = s_atombuffer[tid].velocity.y;
724 if (buffer == 2) vel_z [offset + atomOffset + tid] = s_atombuffer[tid].velocity.z;
725 if (buffer == 3) pos_x [offset + atomOffset + tid] = s_atombuffer[tid].position.x;
726 if (buffer == 4) pos_y [offset + atomOffset + tid] = s_atombuffer[tid].position.y;
727 if (buffer == 5) pos_z [offset + atomOffset + tid] = s_atombuffer[tid].position.z;
728 if (buffer == 6) mass [offset + atomOffset + tid] = s_atombuffer[tid].mass;
729 if (buffer == 7) charge [offset + atomOffset + tid] = s_atombuffer[tid].charge;
730 if (buffer == 8) id [offset + atomOffset + tid] = s_atombuffer[tid].id;
731 if (buffer == 9) vdwType [offset + atomOffset + tid] = s_atombuffer[tid].vdwType;
732 if (buffer == 10) hydrogenGroupSize [offset + atomOffset + tid] = s_atombuffer[tid].hydrogenGroupSize;
733 if (buffer == 11) migrationGroupSize[offset + atomOffset + tid] = s_atombuffer[tid].migrationGroupSize;
734 if (buffer == 12) atomFixed [offset + atomOffset + tid] = s_atombuffer[tid].atomFixed;
735 if (buffer == 13) rigidBondLength [offset + atomOffset + tid] = s_atombuffer[tid].rigidBondLength;
736 if (buffer == 14) transform [offset + atomOffset + tid] = s_atombuffer[tid].transform;
        if (kDoAlch &&
            buffer == 15) partition         [offset + atomOffset + tid] = s_atombuffer[tid].partition;
        if (kDoLangevin &&
            buffer == 16) langevinParam     [offset + atomOffset + tid] = s_atombuffer[tid].langevinParam;
750 void MigrationCUDAKernel::copy_AoS_to_SoA(
751 const int numPatches,
753 const bool langevinOn,
756 const double tempFactor,
757 const CudaLocalRecord* records,
758 const FullAtom* atomdata_AoS,
770 int* hydrogenGroupSize,
771 int* migrationGroupSize,
773 float* rigidBondLength,
776 float* langevinParam,
777 float* langScalVelBBK2,
778 float* langScalRandBBK2,
781 constexpr int numThreads = kSortNumThreads;
782 const int numBlocks = numPatches;
785 constexpr size_t sizeatoms = 0;
787 constexpr size_t sizeatoms = MigrationCUDAKernel::kAtomsPerBuffer*sizeof(FullAtom);
790 #define CALL(alchOn, langevinOn) do { \
792 copy_AoS_to_SoAKernel<alchOn, langevinOn><<<numBlocks, numThreads, sizeatoms, stream>>>( \
796 vel_x, vel_y, vel_z, \
797 pos_x, pos_y, pos_z, \
801 migrationGroupSize, \
811 if (alchOn && langevinOn) {
813 } else if (!alchOn && langevinOn) {
815 } else if (alchOn && !langevinOn) {
823 calculateDerivedKernel<<<numBlocks, numThreads, 0, stream>>>(
830 // This needs the recipMass
833 calculateDerivedLangevinKernel<true><<<numBlocks, numThreads, 0, stream>>>(
844 calculateDerivedLangevinKernel<false><<<numBlocks, numThreads, 0, stream>>>(
 * \brief Computes the solvent index within the patch
 *
 * Within a patch, the atoms must be sorted such that solvent atoms are at the end of the patch.
 * We do this with a scan that computes the indices of the solute atoms, followed by another scan
 * that computes the indices of the solvent atoms.
871 void computeSolventIndexKernel(
872 const int numPatches,
873 const CudaLocalRecord* localRecords,
874 const FullAtom* atomdata_AoS_in,
877 __shared__ CudaLocalRecord s_record;
878 using AccessType = int32_t;
879 AccessType* s_record_buffer = (AccessType*) &s_record;
881 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
882 __shared__ typename BlockScan::TempStorage temp_storage;
884 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
885 // Read in the CudaLocalRecord using multiple threads. This should
887 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
888 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
892 const int numAtoms = s_record.numAtoms;
893 const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
895 int thread_input[MigrationCUDAKernel::kValuesPerThread];
896 int thread_soluteIndex[MigrationCUDAKernel::kValuesPerThread];
897 int thread_solventIndex[MigrationCUDAKernel::kValuesPerThread];
900 // Load the isWater value
901 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
902 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
903 if (idx < numAtoms) {
904 thread_input[i] = 1 - atomdata_AoS_in[offsetIn + idx].isWater;
      // Compute the exclusive prefix sum over the solute flags (also yields numSolute)
912 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_soluteIndex, numSolute);
916 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
917 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
918 if (idx < numAtoms) {
919 thread_input[i] = 1 - thread_input[i];
926 // Compute the prefix sum of water
927 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_solventIndex);
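      // Worked example: for isWater = [0, 1, 1, 0] the first scan over (1 - isWater) gives
      // soluteIndex = [0, 1, 1, 1] with numSolute = 2, and the second scan (over isWater) gives
      // solventIndex = [0, 0, 1, 2]; the loop below then writes sortIndex = [0, 2, 3, 1],
      // i.e. solute atoms keep indices 0..numSolute-1 in their original relative order and
      // solvent atoms follow, also in order.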
931 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
932 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
933 if (idx < numAtoms) {
934 if (thread_input[i]) {
935 sortIndex[offsetIn + idx] = numSolute + thread_solventIndex[i];
937 sortIndex[offsetIn + idx] = thread_soluteIndex[i];
 * \brief Sorts atoms into solute-solvent ordering based on previously computed indices
 *
 * Using the previously computed indices, this kernel moves the atomic data, in AoS format,
 * into the desired order. Note that this kernel copies the atomic data from the scratch buffer
 * into the true atomic AoS buffer: the actual migration data movement copies into the scratch
 * buffer and this kernel copies back.
 *
 * Since the AoS data is ~128 bytes per atom, each warp moves a single atom.
957 __global__ void sortSolventKernel(
958 const int numPatches,
959 const CudaLocalRecord* localRecords,
960 const FullAtom* atomdata_AoS_in,
961 FullAtom* atomdata_AoS_out,
964 __shared__ CudaLocalRecord s_record;
965 using AccessType = int32_t;
966 AccessType* s_record_buffer = (AccessType*) &s_record;
968 const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
969 const int wid = threadIdx.x / WARPSIZE;
970 const int tid = threadIdx.x % WARPSIZE;
972 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
973 // Read in the CudaLocalRecord using multiple threads. This should
975 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
976 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
980 const int numAtoms = s_record.numAtoms;
    // This was causing issues with CUDA 11.3; the offsets needed to be made explicitly 64-bit
985 const int64_t offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
986 const int64_t offset = s_record.bufferOffset;
988 const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
989 const int offset = s_record.bufferOffset;
992 for (int i = wid; i < numAtoms; i+=warps_per_threadblock) {
993 const int dst_idx = sortIndex[offsetIn + i];
994 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
995 int32_t *src_int = (int32_t*) &(atomdata_AoS_in[offsetIn + i]);
996 int32_t *dst_int = (int32_t*) &(atomdata_AoS_out[offset + dst_idx]);
997 dst_int[tid] = src_int[tid];
999 #if defined(NAMD_HIP)
1000 NAMD_WARP_SYNC(WARP_FULL_MASK);
1002 WARP_SYNC(WARP_FULL_MASK);
1010 void MigrationCUDAKernel::sortSolventAtoms(
1011 const int numPatches,
1012 const CudaLocalRecord* records,
1013 const FullAtom* atomdata_AoS_in,
1014 FullAtom* atomdata_AoS_out,
1018 constexpr int numThreads = kSortNumThreads;
1019 const int numBlocks = numPatches;
1021 computeSolventIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1028 sortSolventKernel<<<numBlocks, numThreads, 0, stream>>>(
 * \brief Computes the migration group index
 *
 * Atom migration must occur at the level of migration groups, since some atoms are moved
 * together; e.g. the hydrogens of a water molecule are moved based on the oxygen's position.
 *
 * This kernel computes the number of migration groups per patch as well as their starting
 * indices, which are used later during migration. It does this with a scan operation.
1048 __global__ void computeMigrationGroupIndexKernel(
1049 const int numPatches,
1050 CudaLocalRecord* localRecords,
1051 const int* migrationGroupSize,
1052 int* migrationGroupIndex
1054 __shared__ CudaLocalRecord s_record;
1055 using AccessType = int32_t;
1056 AccessType* s_record_buffer = (AccessType*) &s_record;
1058 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1059 __shared__ typename BlockScan::TempStorage temp_storage;
1061 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1062 // Read in the CudaLocalRecord using multiple threads. This should
1064 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1065 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1069 const int numAtoms = s_record.numAtoms;
1070 const int offset = s_record.bufferOffset;
1072 int thread_size[MigrationCUDAKernel::kValuesPerThread];
1073 int thread_index[MigrationCUDAKernel::kValuesPerThread];
1076 // Load the migration group size
1077 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1078 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1079 if (idx < numAtoms) {
1080 thread_size[i] = migrationGroupSize[offset + idx] ? 1 : 0;
      // Compute the exclusive prefix sum over the migration-group start flags (also yields numGroups)
1088 BlockScan(temp_storage).ExclusiveSum(thread_size, thread_index, numGroups);
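      // Worked example: for migrationGroupSize = [3, 0, 0, 1, 2, 0] the flags are
      // [1, 0, 0, 1, 1, 0], the exclusive scan yields [0, 1, 1, 1, 2, 3] with numGroups = 3,
      // and the loop below writes migrationGroupIndex = [0, 3, 4], the starting atom index
      // of each migration group within the patch.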
1091 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1092 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1093 if (thread_size[i] != 0) {
1094 migrationGroupIndex[offset + thread_index[i]] = idx;
1099 if (threadIdx.x == 0) {
1100 localRecords[patchIndex].numMigrationGroups = numGroups;
1107 void MigrationCUDAKernel::computeMigrationGroupIndex(
1108 const int numPatches,
1109 CudaLocalRecord* records,
1110 const int* migrationGroupSize,
1111 int* migrationGroupIndex,
1114 constexpr int numThreads = kSortNumThreads;
1115 const int numBlocks = numPatches;
1117 computeMigrationGroupIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
 * \brief Computes the transformed positions of migrated atoms
 *
 * When atoms move between patches, we need to transform their positions based on
 * the lattice. This logic is largely copied from HomePatch.C. The transform is only applied
 * to the atoms that have actually moved between patches, using the numAtomsLocal
 * field of the CudaLocalRecord.
 *
 * Note this kernel could likely be optimized: threads that are not assigned to the
 * parent of a migration group do no work, but not that many atoms actually migrate.
1138 __global__ void transformMigratedPositionsKernel(
1139 const int numPatches,
1140 const CudaLocalRecord* localRecords,
1141 const double3* patchCenter,
1142 FullAtom* atomdata_AoS,
1143 const Lattice lattice
1145 __shared__ CudaLocalRecord s_record;
1146 using AccessType = int32_t;
1147 AccessType* s_record_buffer = (AccessType*) &s_record;
1149 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1150 // Read in the CudaLocalRecord using multiple threads. This should
1152 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1153 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1157 const int offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1158 const int numAtoms = s_record.numAtoms;
1159 const int numAtomsLocal = s_record.numAtomsLocal;
1160 const int numAtomsMigrated = numAtoms - numAtomsLocal;
1161 const double3 center = patchCenter[patchIndex];
1163 for (int i = threadIdx.x; i < numAtomsMigrated; i += blockDim.x) {
1164 const int startingIndex = numAtomsLocal + i;
1165 const int migrationSize = atomdata_AoS[offset + startingIndex].migrationGroupSize;
1166 const int hydrogenSize = atomdata_AoS[offset + startingIndex].hydrogenGroupSize;
1168 if (migrationSize == 0) continue;
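      // Two cases follow: if the migration group spans several hydrogen groups
      // (migrationSize != hydrogenSize), the wrapping transform is derived from the average
      // position of the hydrogen-group parents via lattice.nearest() and then applied to every
      // atom in the group; otherwise the parent atom's own position is wrapped directly. Either
      // way the whole group shares a single transform, so it cannot be split across periodic
      // images.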
1170 Transform parent_transform;
1171 if (migrationSize != hydrogenSize) {
1172 double3 c_pos = make_double3(0.0, 0.0, 0.0);
1174 int tmp_hydrogenGroupSize = hydrogenSize;
1175 for (int j = 0; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1176 c_pos = c_pos + (double3) atomdata_AoS[offset + startingIndex + j].position;
1178 tmp_hydrogenGroupSize = atomdata_AoS[offset + startingIndex + j].hydrogenGroupSize;
1180 c_pos.x = c_pos.x / ((double) c);
1181 c_pos.y = c_pos.y / ((double) c);
1182 c_pos.z = c_pos.z / ((double) c);
1184 parent_transform = atomdata_AoS[offset+startingIndex].transform;
1185 c_pos = lattice.nearest(c_pos, center, &parent_transform);
1187 double3 pos = atomdata_AoS[offset+startingIndex].position;
1188 Transform transform = atomdata_AoS[offset+startingIndex].transform;
1190 pos = lattice.reverse_transform(pos, transform);
1191 pos = lattice.apply_transform(pos, parent_transform);
1193 atomdata_AoS[offset+startingIndex].transform = parent_transform;
1194 atomdata_AoS[offset+startingIndex].position = pos;
1197 double3 pos = atomdata_AoS[offset+startingIndex].position;
1198 Transform transform = atomdata_AoS[offset+startingIndex].transform;
1200 pos = lattice.nearest(pos, center, &transform);
1201 parent_transform = transform;
1203 atomdata_AoS[offset+startingIndex].transform = transform;
1204 atomdata_AoS[offset+startingIndex].position = pos;
1206 for (int j = 1; j < migrationSize; j++) {
1207 double3 pos = atomdata_AoS[offset+startingIndex + j].position;
1208 Transform transform = atomdata_AoS[offset+startingIndex +j].transform;
1210 pos = lattice.reverse_transform(pos, transform);
1211 pos = lattice.apply_transform(pos, parent_transform);
1213 atomdata_AoS[offset+startingIndex+j].transform = parent_transform;
1214 atomdata_AoS[offset+startingIndex+j].position = pos;
1221 void MigrationCUDAKernel::transformMigratedPositions(
1222 const int numPatches,
1223 const CudaLocalRecord* localRecords,
1224 const double3* patchCenter,
1225 FullAtom* atomdata_AoS,
1226 const Lattice lattice,
1229 constexpr int numThreads = kSortNumThreads;
1230 const int numBlocks = numPatches;
1232 transformMigratedPositionsKernel<<<numBlocks, numThreads, 0, stream>>>(
 * \brief Computes the new location of a migration group
 *
 * Computes the migration destination for each migration group. This info includes:
 * - Destination patch ID
 * - Destination patch local index (this is the home patch's index)
 * - Destination device
 *
 * While the information is computed at the migration group level, it is stored for each atom
 * in a migration group, so some of the later data movement can happen at the per-atom level.
 *
 * Note the migrationDestination structure will eventually include the atom's new index within a
 * patch, but that is determined later.
1259 computeMigrationDestinationKernel(
1260 const int numPatches,
1261 CudaLocalRecord* localRecords,
1262 const Lattice lattice,
1263 const CudaMInfo* mInfo,
1264 const int* patchToDeviceMap,
1265 const int* globalToLocalMap,
1266 const double3* patchMin,
1267 const double3* patchMax,
1268 const int* hydrogenGroupSize,
1269 const int* migrationGroupSize,
1270 const int* migrationGroupIndex,
1271 const double* pos_x,
1272 const double* pos_y,
1273 const double* pos_z,
1274 int4* migrationDestination
1276 __shared__ CudaLocalRecord s_record;
1277 using AccessType = int32_t;
1278 AccessType* s_record_buffer = (AccessType*) &s_record;
1280 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1281 // Read in the CudaLocalRecord using multiple threads. This should
1283 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1284 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1287 // Clear migration atom count
1288 if (threadIdx.x == 0) {
1289 localRecords[patchIndex].numAtomsNew = 0;
1293 const int numMigrationGroups = s_record.numMigrationGroups;
1294 const int offset = s_record.bufferOffset;
1295 const double3 min = patchMin[patchIndex];
1296 const double3 max = patchMax[patchIndex];
1297 CudaMInfo migration_info = mInfo[patchIndex];
1299 int xdev, ydev, zdev;
1301 for (int i = threadIdx.x; i < numMigrationGroups; i += blockDim.x) {
1303 const int startingIndex = migrationGroupIndex[offset + i];
1304 const int migrationSize = migrationGroupSize[offset + startingIndex];
1305 const int hydrogenSize = hydrogenGroupSize[offset + startingIndex];
1308 pos.x = pos_x[offset + startingIndex];
1309 pos.y = pos_y[offset + startingIndex];
1310 pos.z = pos_z[offset + startingIndex];
1312 if (migrationSize != hydrogenSize) {
1314 int tmp_hydrogenGroupSize = 1;
1315 for (int j = hydrogenSize; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1316 pos.x = pos.x + pos_x[offset + startingIndex + j];
1317 pos.y = pos.y + pos_y[offset + startingIndex + j];
1318 pos.z = pos.z + pos_z[offset + startingIndex + j];
1320 tmp_hydrogenGroupSize = hydrogenGroupSize[offset + startingIndex + j];
1322 pos.x = pos.x / ((double) c);
1323 pos.y = pos.y / ((double) c);
1324 pos.z = pos.z / ((double) c);
1327 double3 s = lattice.scale(pos);
1329 if (s.x < min.x) xdev = 0;
1330 else if (max.x <= s.x) xdev = 2;
1333 if (s.y < min.y) ydev = 0;
1334 else if (max.y <= s.y) ydev = 2;
1337 if (s.z < min.z) zdev = 0;
1338 else if (max.z <= s.z) zdev = 2;
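        // xdev/ydev/zdev classify the scaled group position against this patch's bounds:
        // 0 = below min, 2 = at or above max, and (presumably, in the elided else branches)
        // 1 = inside. The triple indexes the 3x3x3 destPatchID table of neighboring patches,
        // so (1,1,1) keeps the group in its current patch.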
1341 int dest_patch = migration_info.destPatchID[xdev][ydev][zdev];
1342 int dest_device = patchToDeviceMap[dest_patch];
1343 int dest_localID = globalToLocalMap[dest_patch];
1345 dest_info.x = dest_patch;
1346 dest_info.y = dest_device;
1347 dest_info.z = dest_localID;
1350 for (int j = 0; j < migrationSize; j++) {
1351 migrationDestination[offset + startingIndex + j] = dest_info;
1358 void MigrationCUDAKernel::computeMigrationDestination(
1359 const int numPatches,
1360 CudaLocalRecord* localRecords,
1361 const Lattice lattice,
1362 const CudaMInfo* mInfo,
1363 const int* patchToDeviceMap,
1364 const int* globalToLocalMap,
1365 const double3* patchMin,
1366 const double3* patchMax,
1367 const int* hydrogenGroupSize,
1368 const int* migrationGroupSize,
1369 const int* migrationGroupIndex,
1370 const double* pos_x,
1371 const double* pos_y,
1372 const double* pos_z,
1373 int4* migrationDestination,
1376 constexpr int numThreads = kSortNumThreads;
1377 const int numBlocks = numPatches;
1379 computeMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1390 migrationGroupIndex,
1391 pos_x, pos_y, pos_z,
1392 migrationDestination
 * \brief Updates the AoS data structure before migration
 *
 * Copies the necessary data from the SoA buffers to the AoS buffers;
 * this is just the positions and velocities.
 *
 * TODO: optimize this kernel
1407 __global__ void update_AoSKernel(
1408 const int numPatches,
1409 const CudaLocalRecord* localRecords,
1410 FullAtom* atomdata_AoS,
1411 const double* vel_x,
1412 const double* vel_y,
1413 const double* vel_z,
1414 const double* pos_x,
1415 const double* pos_y,
1418 __shared__ CudaLocalRecord s_record;
1419 using AccessType = int32_t;
1420 AccessType* s_record_buffer = (AccessType*) &s_record;
1422 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1423 // Read in the CudaLocalRecord using multiple threads. This should
1425 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1426 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1430 const int numAtoms = s_record.numAtoms;
1431 const int offset = s_record.bufferOffset;
1433 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1435 pos.x = pos_x[offset + i];
1436 pos.y = pos_y[offset + i];
1437 pos.z = pos_z[offset + i];
1440 vel.x = vel_x[offset + i];
1441 vel.y = vel_y[offset + i];
1442 vel.z = vel_z[offset + i];
1444 atomdata_AoS[offset + i].position = pos;
1445 atomdata_AoS[offset + i].velocity = vel;
1451 void MigrationCUDAKernel::update_AoS(
1452 const int numPatches,
1453 const CudaLocalRecord* records,
1454 FullAtom* atomdata_AoS,
1455 const double* vel_x,
1456 const double* vel_y,
1457 const double* vel_z,
1458 const double* pos_x,
1459 const double* pos_y,
1460 const double* pos_z,
1463 constexpr int numThreads = kSortNumThreads;
1464 const int numBlocks = numPatches;
1466 update_AoSKernel<<<numBlocks, numThreads, 0, stream>>>(
1470 vel_x, vel_y, vel_z,
 * \brief Copies atoms that aren't "moving" into the scratch buffers
 *
 * This kernel copies the atoms that are not moving to a new patch into that
 * patch's scratch buffers. It uses a scan operation to eliminate the need for
 * atomic operations when computing each atom's new location.
 *
 * The scan does not change the order of the atoms within a migration group.
 *
 * Note this sets the local ID of the atom in migrationDestination.
 * Note because the AoS atomic data is ~128 bytes, each warp moves one atom.
1490 __global__ void performLocalMigrationKernel(
1491 const int numPatches,
1492 CudaLocalRecord* localRecords,
1493 const FullAtom* atomdata_AoS_in,
1494 FullAtom* atomdata_AoS_out,
1495 int4* migrationDestination
1497 __shared__ CudaLocalRecord s_record;
1498 using AccessType = int32_t;
1499 AccessType* s_record_buffer = (AccessType*) &s_record;
1501 typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1502 __shared__ typename BlockScan::TempStorage temp_storage;
1504 __shared__ int s_local_index[MigrationCUDAKernel::kSortNumThreads * MigrationCUDAKernel::kValuesPerThread];
1506 const int warps_per_threadblock = blockDim.x / WARPSIZE;
1507 const int wid = threadIdx.x / WARPSIZE;
1508 const int tid = threadIdx.x % WARPSIZE;
1510 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1511 // Read in the CudaLocalRecord using multiple threads. This should
1513 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1514 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1518 const int patchID = s_record.patchID;
1519 const int numAtoms = s_record.numAtoms;
1520 const int offset = s_record.bufferOffset;
1521 const int dst_offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1524 // Load in if the atom is staying in the same patch
1525 int thread_input[MigrationCUDAKernel::kValuesPerThread];
1526 int thread_output[MigrationCUDAKernel::kValuesPerThread];
1528 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1529 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1530 if (idx < numAtoms && migrationDestination[offset + idx].x == patchID) {
1531 thread_input[i] = 1;
1533 thread_input[i] = 0;
1538 // Compute the prefix sum of local
1539 BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, numLocal);
1542 for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1543 const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1544 if (idx < numAtoms && thread_input[i]) {
1545 s_local_index[thread_output[i]] = idx;
1548 if (threadIdx.x == 0) {
1549 localRecords[patchIndex].numAtomsLocal = numLocal;
1550 localRecords[patchIndex].numAtomsNew = numLocal;
1554 // We can do this at the atom level instead of the migrationGroup level
1555 // because the migrationDestinations take that into account and
1556 // the prefix sum means we do not change the ordering of local atoms
1557 for (int i = wid; i < numLocal; i += warps_per_threadblock) {
1558 const int src_atom_idx = s_local_index[i];
1559 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1560 int32_t *src_int = (int32_t*) &(atomdata_AoS_in [offset + src_atom_idx]);
1561 int32_t *dst_int = (int32_t*) &(atomdata_AoS_out [dst_offset + i]);
1562 dst_int[tid] = src_int[tid];
1565 migrationDestination[offset + src_atom_idx].w = i;
1567 #if defined(NAMD_HIP)
1568 NAMD_WARP_SYNC(WARP_FULL_MASK);
1570 WARP_SYNC(WARP_FULL_MASK);
1577 void MigrationCUDAKernel::performLocalMigration(
1578 const int numPatches,
1579 CudaLocalRecord* records,
1580 const FullAtom* atomdata_AoS_in,
1581 FullAtom* atomdata_AoS_out,
1582 int4* migrationDestination,
1585 constexpr int numThreads = kSortNumThreads;
1586 const int numBlocks = numPatches;
1588 performLocalMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1593 migrationDestination
 * \brief Moves the migrating atoms into their new patches
 *
 * Copies the atoms which have moved to new patches
 * into those patches' scratch buffers. Atomic operations are used to
 * determine the new index of each migration group. This happens at the migration
 * group level, so atoms within a migration group stay in the same order.
 *
 * Note because not many atoms move, the use of atomics isn't too expensive.
 * Note because the AoS atomic data is ~128 bytes, each warp moves one atom.
1611 __global__ void performMigrationKernel(
1612 const int numPatches,
1613 CudaLocalRecord* localRecords,
1614 CudaLocalRecord** peer_records,
1615 const FullAtom* local_atomdata_AoS,
1616 FullAtom** peer_atomdata_AoS,
1617 const int* migrationGroupSize,
1618 const int* migrationGroupIndex,
1619 int4* migrationDestination
1621 __shared__ CudaLocalRecord s_record;
1622 using AccessType = int32_t;
1623 AccessType* s_record_buffer = (AccessType*) &s_record;
1625 const int warps_per_threadblock = blockDim.x / WARPSIZE;
1626 const int wid = threadIdx.x / WARPSIZE;
1627 const int tid = threadIdx.x % WARPSIZE;
1629 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1630 // Read in the CudaLocalRecord using multiple threads. This should
1632 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1633 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1637 const int numMigrationGroups = s_record.numMigrationGroups;
1638 const int offset = s_record.bufferOffset;
1639 const int patchID = s_record.patchID;
1643 for (int i = wid; i < numMigrationGroups; i += warps_per_threadblock) {
1644 const int startingIndex = migrationGroupIndex[offset + i];
1646 const int size = migrationGroupSize[offset + startingIndex];
1648 const int4 migrationInfo = migrationDestination[offset + startingIndex];
1649 const int dst_patchID = migrationInfo.x;
1650 const int dst_device = migrationInfo.y;
1651 const int dst_localID = migrationInfo.z;
1652 const int dst_offset = dst_localID * MigrationCUDAKernel::kMaxAtomsPerPatch;
1654 if (dst_patchID == patchID) continue;
1656 // Get index via atomic operation
1659 int* counter_index = &(peer_records[dst_device][dst_localID].numAtomsNew);
1660 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
1661 dst_atom_idx = atomicAdd_system(counter_index, size);
1663 // support single-GPU Maxwell
1664 dst_atom_idx = atomicAdd(counter_index, size);
1667 dst_atom_idx = WARP_SHUFFLE(WARP_FULL_MASK, dst_atom_idx, 0, WARPSIZE);
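        // The atomicAdd reserves `size` consecutive slots in the destination patch's scratch
        // buffer, using numAtomsNew (possibly on a peer device) as that patch's fill counter;
        // since the result is broadcast from lane 0, the add is presumably performed by lane 0
        // only in the elided guard, and the shuffle shares the base index with the whole warp.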
1670 FullAtom* remote_atomdata_AoS = peer_atomdata_AoS[dst_device];
1671 for (int j = 0; j < size; j++) {
1672 if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1673 int32_t *src_int = (int32_t*) &(local_atomdata_AoS [offset + startingIndex + j]);
1674 int32_t *dst_int = (int32_t*) &(remote_atomdata_AoS [dst_offset + dst_atom_idx + j]);
1675 dst_int[tid] = src_int[tid];
1678 migrationDestination[offset + startingIndex + j].w = dst_atom_idx + j;
1680 #if defined(NAMD_HIP)
1681 NAMD_WARP_SYNC(WARP_FULL_MASK);
1683 WARP_SYNC(WARP_FULL_MASK);
1692 void MigrationCUDAKernel::performMigration(
1693 const int numPatches,
1694 CudaLocalRecord* records,
1695 CudaLocalRecord** peer_records,
1696 const FullAtom* local_atomdata_AoS,
1697 FullAtom** peer_atomdata_AoS,
1698 const int* migrationGroupSize,
1699 const int* migrationGroupIndex,
1700 int4* migrationDestination,
1703 constexpr int numThreads = kSortNumThreads;
1704 const int numBlocks = numPatches;
1706 performMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1713 migrationGroupIndex,
1714 migrationDestination
 * \brief Updates the migrationDestination array based on the solute-solvent order
 *
 * The migrationDestination structure is used later by the tuple migration, so
 * we need to keep it accurate after the solute-solvent sorting. This uses peer
 * access to update each atom's new index within its new patch.
1728 __global__ void updateMigrationDestinationKernel(
1729 const int numAtomsHome,
1730 int4* migrationDestination,
1731 int** d_peer_sortSoluteIndex
1733 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1734 for (int i = tid; i < numAtomsHome; i += blockDim.x * gridDim.x) {
1735 const int4 dest = migrationDestination[i];
1736 const int dst_device = dest.y;
    const int dst_localIndex = dest.z;  // dest.z holds the destination patch's local index
    const int dst_patchOffset = MigrationCUDAKernel::kMaxAtomsPerPatch * dst_localIndex;
1739 const int inputIndex = dest.w;
1741 const int outputIndex = d_peer_sortSoluteIndex[dst_device][dst_patchOffset + inputIndex];
1742 migrationDestination[i].w = outputIndex;
1746 void MigrationCUDAKernel::updateMigrationDestination(
1747 const int numAtomsHome,
1748 int4* migrationDestination,
1749 int** d_peer_sortSoluteIndex,
1752 constexpr int numThreads = kSortNumThreads;
1753 const int numBlocks = (numAtomsHome + kSortNumThreads - 1) / kSortNumThreads;
1755 updateMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1757 migrationDestination,
1758 d_peer_sortSoluteIndex
1764 * \brief Copies static atomic data to proxy patches
1767 * Copies atomic data that does not change to proxy patches.
1770 template <bool doAlch>
1771 __global__ void copyDataToProxiesKernel(
1772 const int deviceIndex,
1773 const int numPatchesHome,
1774 const int numPatchesHomeAndProxy,
1775 const CudaLocalRecord* localRecords,
1778 int** peer_sortOrder,
1779 int** peer_unsortOrder,
1780 float** peer_charge,
1781 int** peer_partition,
1782 double3** peer_patchCenter
1784 __shared__ CudaLocalRecord s_record;
1785 using AccessType = int32_t;
1786 AccessType* s_record_buffer = (AccessType*) &s_record;
1788 for (int patchIndex = blockIdx.x + numPatchesHome; patchIndex < numPatchesHomeAndProxy; patchIndex += gridDim.x) {
1789 // Read in the CudaLocalRecord using multiple threads. This should
1791 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1792 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1796 const int numAtoms = s_record.numAtoms;
1797 const int dstOffset = s_record.bufferOffset;
1798 const int srcDevice = s_record.inline_peers[0].deviceIndex;
1799 const int srcOffset = s_record.inline_peers[0].bufferOffset;
1800 const int srcPatchIndex = s_record.inline_peers[0].patchIndex;
1802 // TODO this data is probably on the host somewhere...
1803 // And it probably doesn't change????
1804 if (threadIdx.x == 0) {
1805 peer_patchCenter[deviceIndex][patchIndex] = peer_patchCenter[srcDevice][srcPatchIndex];
1809 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1810 peer_id [deviceIndex][dstOffset + i] = peer_id[srcDevice][srcOffset + i];
1811 peer_vdwType [deviceIndex][dstOffset + i] = peer_vdwType[srcDevice][srcOffset + i];
1812 peer_sortOrder [deviceIndex][dstOffset + i] = peer_sortOrder[srcDevice][srcOffset + i];
1813 peer_unsortOrder[deviceIndex][dstOffset + i] = peer_unsortOrder[srcDevice][srcOffset + i];
1814 peer_charge [deviceIndex][dstOffset + i] = peer_charge[srcDevice][srcOffset + i];
1816 peer_partition[deviceIndex][dstOffset + i] = peer_partition[srcDevice][srcOffset + i];
1823 void MigrationCUDAKernel::copyDataToProxies(
1824 const int deviceIndex,
1825 const int numPatchesHome,
1826 const int numPatchesHomeAndProxy,
1827 const CudaLocalRecord* records,
1830 int** peer_sortOrder,
1831 int** peer_unsortOrder,
1832 float** peer_charge,
1833 int** peer_partition,
1834 double3** peer_patchCenter,
1838 constexpr int numThreads = kSortNumThreads;
1839 const int numPatches = numPatchesHomeAndProxy - numPatchesHome;
1840 const int numBlocks = numPatches;
1843 copyDataToProxiesKernel<true><<<numBlocks, numThreads, 0, stream>>>(
1845 numPatchesHome, numPatchesHomeAndProxy,
1847 peer_id, peer_vdwType, peer_sortOrder,
1848 peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1852 copyDataToProxiesKernel<false><<<numBlocks, numThreads, 0, stream>>>(
1854 numPatchesHome, numPatchesHomeAndProxy,
1856 peer_id, peer_vdwType, peer_sortOrder,
1857 peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
 * \brief Copies migrationDestination to proxy patches
 *
 * This copies the migrationDestination data to proxies to be used in tuple migration.
 * It needs to use the old buffer offsets and atom counts, so it is separate from the
 * other copyDataToProxies kernel. Additionally, we do this as a put operation to
 * avoid a node barrier: the home patch produces this data and then needs to write it
 * to the proxy patch. If the device which owns the proxy patch did a get operation instead,
 * we would need a node barrier to make sure the data was ready to be read. With a put
 * operation, a device-local synchronization (i.e. a new kernel launch) is enough to
 * make sure the data is ready.
1876 __global__ void copyMigrationDestinationToProxiesKernel(
1877 const int deviceIndex,
1878 const int numPatchesHome,
1879 const int numPatchesHomeAndProxy,
1880 const CudaLocalRecord* localRecords,
1881 const CudaPeerRecord* peerRecords,
1882 int4** peer_migrationDestination
1884 __shared__ CudaLocalRecord s_record;
1885 using AccessType = int32_t;
1886 AccessType* s_record_buffer = (AccessType*) &s_record;
1888 for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
1889 // Read in the CudaLocalRecord using multiple threads. This should
1891 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1892 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1896 const int numAtoms = s_record.numAtomsNew;
1897 const int dstOffset = s_record.bufferOffsetOld;
1898 const int numPeerRecords = s_record.numPeerRecord;
1899 const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
1901 const int inlineRemotes = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
1903 for (int remote = 0; remote < inlineRemotes; remote++) {
1904 const int srcDevice = s_record.inline_peers[remote].deviceIndex;
1905 const int srcOffset = s_record.inline_peers[remote].bufferOffset;
1906 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1907 peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1911 for (int remote = inlineRemotes; remote < numPeerRecords; remote++) {
1912 const int srcDevice = peerRecords[peerRecordStartIndexs + remote].deviceIndex;
1913 const int srcOffset = peerRecords[peerRecordStartIndexs + remote].bufferOffset;
1914 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1915 peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1922 void MigrationCUDAKernel::copyMigrationDestinationToProxies(
1923 const int deviceIndex,
1924 const int numPatchesHome,
1925 const int numPatchesHomeAndProxy,
1926 const CudaLocalRecord* records,
1927 const CudaPeerRecord* peerRecords,
1928 int4** peer_migrationDestination,
1931 constexpr int numThreads = kSortNumThreads;
1932 const int numBlocks = numPatchesHome;
1934 copyMigrationDestinationToProxiesKernel<<<numBlocks, numThreads, 0, stream>>>(
1936 numPatchesHome, numPatchesHomeAndProxy,
1939 peer_migrationDestination
 * \brief Updates the CudaLocalRecord data structure
 *
 * This updates the number of atoms and buffer offsets of the patches. It should only
 * be run on the home patches since it uses peer access to update remote structures.
1951 __global__ void updateLocalRecordsKernel(
1952 const int numPatchesHome,
1953 CudaLocalRecord* localRecords,
1954 CudaLocalRecord** peer_records,
1955 const CudaPeerRecord* peerRecords
1957 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1958 for (int i = tid; i < numPatchesHome; i += blockDim.x * gridDim.x) {
1959 const int numAtomsOld = localRecords[i].numAtoms;
1960 const int numAtoms = localRecords[i].numAtomsNew;
1961 const int numAtomsNBPad = CudaComputeNonbondedKernel::computeAtomPad(numAtoms);
1962 localRecords[i].numAtoms = numAtoms;
1963 localRecords[i].numAtomsNBPad = numAtomsNBPad;
1964 localRecords[i].bufferOffsetOld = localRecords[i].bufferOffset;
1965 localRecords[i].numAtomsNew = numAtomsOld;
1967 const int numPeerRecord = localRecords[i].numPeerRecord;
1968 const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
1969 // TODO use inline remotes??
1971 for (int remote = 0; remote < numPeerRecord; remote++) {
1972 const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1973 const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
1974 peer_records[srcDevice][srcOffset].numAtoms = numAtoms;
1975 peer_records[srcDevice][srcOffset].numAtomsNBPad = numAtomsNBPad;
1980 void MigrationCUDAKernel::updateLocalRecords(
1981 const int numPatchesHome,
1982 CudaLocalRecord* records,
1983 CudaLocalRecord** peer_records,
1984 const CudaPeerRecord* peerRecords,
1987 const int numThreads = kSortNumThreads;
1988 const int numBlocks = (numPatchesHome + kSortNumThreads - 1) / kSortNumThreads;
1990 updateLocalRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2000 struct RecordToNumAtoms {
2001 __host__ __device__ __forceinline__
2002 int operator()(const CudaLocalRecord &a) const {
2007 struct RecordTonumAtomsNBPad {
2008 __host__ __device__ __forceinline__
2009 int operator()(const CudaLocalRecord &a) const {
2010 return a.numAtomsNBPad;
2015 * \brief Updates the buffer offsets based on scratch patchOffsets
2018 __global__ void updateLocalRecordsOffsetKernel(
2019 const int numPatchesHomeAndProxy,
2020 CudaLocalRecord* localRecords,
2024 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2025 for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2026 localRecords[i].bufferOffset = patchOffsets[i];
2027 localRecords[i].bufferOffsetNBPad = patchOffsetsNB[i];
2031 void MigrationCUDAKernel::updateLocalRecordsOffset(
2032 const int numPatchesHomeAndProxy,
2033 CudaLocalRecord* records,
2036 const int numThreads = kSortNumThreads;
2037 const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2038 size_t temp = patchDeviceScan_alloc;
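  // Each cub::TransformInputIterator presents the per-patch atom count (numAtoms or
  // numAtomsNBPad) of the CudaLocalRecord array as a stream of ints, and
  // cub::DeviceScan::ExclusiveSum converts those counts into starting offsets,
  // e.g. counts [64, 32, 48] -> offsets [0, 64, 96]. The kernel launched below copies
  // the resulting offsets back into the records.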
2041 // Integration Offsets
2043 RecordToNumAtoms conversion_op;
2044 using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2045 InputIter iter(records, conversion_op);
2047 cub::DeviceScan::ExclusiveSum<InputIter>(
2048 d_patchDeviceScan_scratch, temp,
2049 iter, d_patchOffset_temp, numPatchesHomeAndProxy, stream
2053 // Nonbonded Offsets
2055 RecordTonumAtomsNBPad conversion_op_nb;
2056 using InputIterNB = cub::TransformInputIterator<int, RecordTonumAtomsNBPad, CudaLocalRecord*>;
2057 InputIterNB iterNB(records, conversion_op_nb);
2059 cub::DeviceScan::ExclusiveSum<InputIterNB>(
2060 d_patchDeviceScan_scratch, temp,
2061 iterNB, d_patchOffsetNB_temp, numPatchesHomeAndProxy, stream
2067 updateLocalRecordsOffsetKernel<<<numBlocks, numThreads, 0, stream>>>(
2068 numPatchesHomeAndProxy,
2071 d_patchOffsetNB_temp
2076 * \brief Updates the remote records on this device
2079 __global__ void updatePeerRecordsKernel(
2080 const int numPatchesHomeAndProxy,
2081 CudaLocalRecord* localRecords,
2082 CudaLocalRecord** peer_records,
2083 CudaPeerRecord* peerRecords
2085 const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2086 for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2087 const int numPeerRecord = localRecords[i].numPeerRecord;
2088 const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
2090 for (int remote = 0; remote < numPeerRecord; remote++) {
2091 const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
2092 const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
2094 const int bufferOffset = peer_records[srcDevice][srcOffset].bufferOffset;
2095 const int bufferOffsetNBPad = peer_records[srcDevice][srcOffset].bufferOffsetNBPad;
2097 peerRecords[peerRecordStartIndex + remote].bufferOffset = bufferOffset;
2098 peerRecords[peerRecordStartIndex + remote].bufferOffsetNBPad = bufferOffsetNBPad;
2100 if (remote < CudaLocalRecord::num_inline_peer) {
2101 localRecords[i].inline_peers[remote].bufferOffset = bufferOffset;
2102 localRecords[i].inline_peers[remote].bufferOffsetNBPad = bufferOffsetNBPad;
2108 void MigrationCUDAKernel::updatePeerRecords(
2109 const int numPatchesHomeAndProxy,
2110 CudaLocalRecord* records,
2111 CudaLocalRecord** peer_records,
2112 CudaPeerRecord* peerRecords,
2115 const int numThreads = kSortNumThreads;
2116 const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2118 updatePeerRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2119 numPatchesHomeAndProxy,
2126 MigrationCUDAKernel::MigrationCUDAKernel() {
2127 d_patchOffset_temp = NULL;
2128 d_patchOffsetNB_temp = NULL;
2130 patchDeviceScan_alloc = 0;
2131 d_patchDeviceScan_scratch = NULL;
2134 MigrationCUDAKernel::~MigrationCUDAKernel() {
2135 if (d_patchOffset_temp != NULL) deallocate_device<int>(&d_patchOffset_temp);
2136 if (d_patchOffsetNB_temp != NULL) deallocate_device<int>(&d_patchOffsetNB_temp);
2137 if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
2140 void MigrationCUDAKernel::allocateScratch(const int numPatchesHomeAndProxy) {
2141 allocate_device<int>(&d_patchOffset_temp, numPatchesHomeAndProxy);
2142 allocate_device<int>(&d_patchOffsetNB_temp, numPatchesHomeAndProxy);
2144 // Fake call to CUB to get required size
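  // Per the CUB convention, passing a NULL temp-storage pointer makes DeviceScan::ExclusiveSum
  // skip the scan and only report the required scratch size in temp_storage_bytes; the real
  // scans in updateLocalRecordsOffset() then reuse the allocation made below.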
2145 RecordToNumAtoms conversion_op;
2146 using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2147 InputIter iter(NULL, conversion_op);
2149 void *d_temp_storage = NULL;
2150 size_t temp_storage_bytes = 0;
2152 cub::DeviceScan::ExclusiveSum<InputIter>(
2153 d_temp_storage, temp_storage_bytes,
2154 iter, d_patchOffset_temp, numPatchesHomeAndProxy
2157 patchDeviceScan_alloc = temp_storage_bytes;
2158 allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
2160 #endif // NODEGROUP_FORCE_REGISTER