NAMD
MigrationCUDAKernel.cu
1 #ifdef NAMD_CUDA
2 #if __CUDACC_VER_MAJOR__ >= 11
3 #include <cub/cub.cuh>
4 #else
5 #include <namd_cub/cub.cuh>
6 #endif
7 #else // NAMD_HIP
8 #include <hip/hip_runtime.h>
9 #include <hipcub/hipcub.hpp>
10 #define cub hipcub
11 #endif
12 
13 #include "HipDefines.h"
14 
15 #include "MigrationCUDAKernel.h"
16 #include "CudaComputeNonbondedKernel.h"
17 #include "CudaComputeNonbondedKernel.hip.h"
18 
19 #ifdef NODEGROUP_FORCE_REGISTER
20 
21 #define MAX_VALUE 2147483647
22 #define BIG_FLOAT 1e20
23 #define SMALL_FLOAT -1e20
24 
25 __device__ __forceinline__ void singleBisect(
26  const int bit_pos,
27  const int bit_total,
28  const double min_dim,
29  const double max_dim,
30  const double* current_dim,
31  const int numAtoms,
32  unsigned int* s_indexBuffer,
33  unsigned int (&thread_start)[MigrationCUDAKernel::kValuesPerThread],
34  unsigned int (&thread_end)[MigrationCUDAKernel::kValuesPerThread],
35  unsigned int (&thread_out)[MigrationCUDAKernel::kValuesPerThread],
36  unsigned int (&thread_keys)[MigrationCUDAKernel::kValuesPerThread],
37  unsigned int (&thread_values)[MigrationCUDAKernel::kValuesPerThread]
38 ) {
39  constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
40  typedef cub::BlockRadixSort<unsigned int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, unsigned int> BlockRadixSort;
41  __shared__ typename BlockRadixSort::TempStorage temp_sort;
42 
43 
44  for (int i = 0; i < kValuesPerThread; i++) {
45  const int idx = kValuesPerThread * threadIdx.x + i;
46  if (idx < numAtoms) {
47  const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
48  thread_keys[i] = thread_out[i] | pos_norm;
49  thread_values[i] = idx;
50  } else {
51  thread_keys[i] = ~0;
52  thread_values[i] = ~0;
53  }
54  }
55  __syncthreads();
56 
57  BlockRadixSort(temp_sort).Sort(thread_keys, thread_values);
58  __syncthreads();
59 
60  for (int i = 0; i < kValuesPerThread; i++) {
61  const int idx = kValuesPerThread * threadIdx.x + i;
62  if (thread_values[i] < numAtoms) {
63  s_indexBuffer[thread_values[i]] = idx;
64  }
65  }
66  __syncthreads();
67 
68  // Get sort index
69  for (int i = 0; i < kValuesPerThread; i++) {
70  const int idx = kValuesPerThread * threadIdx.x + i;
71  if (idx < numAtoms) {
72  const int newIndex = s_indexBuffer[idx];
73 
74  int split_factor = 32;
75  int split = -1;
76  while (split == -1) {
77  if ( thread_start[i]/split_factor < (thread_end[i]-1)/split_factor ) {
78  split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
79  }
80  split_factor /= 2;
81  if (split_factor == 1){
82  split = ((thread_start[i] + thread_end[i] + split_factor) / (split_factor*2)) * split_factor;
83  }
84  }
85 
86  if (newIndex >= split) {
87  thread_out[i] |= 1 << (30 - bit_pos);
88  thread_start[i] = split;
89  } else {
90  thread_end[i] = split;
91  }
92  if (bit_pos == bit_total - 1) {
93  const unsigned int pos_norm = (unsigned int) ((current_dim[idx] - min_dim) / (max_dim - min_dim) * (1 << 18));
94  thread_out[i] |= pos_norm;
95  }
96  }
97  }
98  __syncthreads();
99 }
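// Sketch of the key layout accumulated in thread_out by repeated singleBisect calls
// (an illustration added for clarity; field positions assume the default num_iters = 3
// in computeBisect below, i.e. bit_total = 9):
//
//   bit 31: unused
//   bits 30 .. (31 - bit_total): one bit per bisection decision (upper half = 1)
//   low ~18 bits: normalized coordinate along the final bisection axis
//
// Sorting these keys therefore groups atoms by bisection cell first and orders them
// by position within the final cell second.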
100 
101 
102 __global__ void computeBisect(
103  const int numPatches,
104  const int numGrids,
105  const CudaLocalRecord* localRecords,
106  const double3* patchMin,
107  const double3* patchMax,
108  const double* pos_x,
109  const double* pos_y,
110  const double* pos_z,
111  int* sortOrder,
112  int* sortIndex
113 ) {
114  constexpr int kValuesPerThread = MigrationCUDAKernel::kValuesPerThread;
115 
116  __shared__ CudaLocalRecord s_record;
117  using AccessType = int32_t;
118  AccessType* s_record_buffer = (AccessType*) &s_record;
119 
120  typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
121  __shared__ typename BlockReduce::TempStorage temp_storage;
122 
123  __shared__ double3 s_pmin;
124  __shared__ double3 s_pmax;
125 
126 
127  __shared__ unsigned int s_indexBuffer[MigrationCUDAKernel::kMaxAtomsPerPatch];
128 
129  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
130  #pragma unroll
131  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
132  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
133  }
134  __syncthreads();
135 
136  const int numAtoms = s_record.numAtoms;
137  const int offset = s_record.bufferOffset;
138 
139  // Compute the thread-local min/max values
140  double3 pmin, pmax;
141  pmin.x = BIG_FLOAT;
142  pmin.y = BIG_FLOAT;
143  pmin.z = BIG_FLOAT;
144  pmax.x = SMALL_FLOAT;
145  pmax.y = SMALL_FLOAT;
146  pmax.z = SMALL_FLOAT;
147  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
148  pmin.x = fmin(pmin.x, pos_x[offset + i]);
149  pmin.y = fmin(pmin.y, pos_y[offset + i]);
150  pmin.z = fmin(pmin.z, pos_z[offset + i]);
151  pmax.x = fmax(pmax.x, pos_x[offset + i]);
152  pmax.y = fmax(pmax.y, pos_y[offset + i]);
153  pmax.z = fmax(pmax.z, pos_z[offset + i]);
154  }
155  __syncthreads();
156 
157  // Compute the thread-block min/max values
158  pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, cub::Min());
159  __syncthreads();
160  pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, cub::Min());
161  __syncthreads();
162  pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, cub::Min());
163  __syncthreads();
164 
165  pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, cub::Max());
166  __syncthreads();
167  pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, cub::Max());
168  __syncthreads();
169  pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, cub::Max());
170  __syncthreads();
171 
172  if (threadIdx.x == 0) {
173  s_pmin = pmin;
174  s_pmax = pmax;
175  }
176  __syncthreads();
177 
178  pmin = s_pmin;
179  pmax = s_pmax;
180 
181 
182  unsigned int thread_start[kValuesPerThread];
183  unsigned int thread_end[kValuesPerThread];
184  unsigned int thread_out[kValuesPerThread];
185  unsigned int thread_keys[kValuesPerThread];
186  unsigned int thread_values[kValuesPerThread];
187 
188  for (int i = 0; i < kValuesPerThread; i++) {
189  thread_out[i] = 0;
190  thread_start[i] = 0;
191  thread_end[i] = numAtoms;
192  }
193 
194  double diff_x = pmax.x - pmin.x;
195  double diff_y = pmax.y - pmin.y;
196  double diff_z = pmax.z - pmin.z;
197  int bit = 0;
198  const int num_iters = 3;
199  for (int i = 0; i < num_iters; i++) {
200  if (diff_x > diff_y && diff_y > diff_z) {
201  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
202  thread_start, thread_end, thread_out, thread_keys, thread_values);
203  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
204  thread_start, thread_end, thread_out, thread_keys, thread_values);
205  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
206  thread_start, thread_end, thread_out, thread_keys, thread_values);
207  } else if (diff_x > diff_z && diff_z > diff_y) {
208  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
209  thread_start, thread_end, thread_out, thread_keys, thread_values);
210  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
211  thread_start, thread_end, thread_out, thread_keys, thread_values);
212  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
213  thread_start, thread_end, thread_out, thread_keys, thread_values);
214  } else if (diff_y > diff_x && diff_x > diff_z) {
215  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
216  thread_start, thread_end, thread_out, thread_keys, thread_values);
217  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
218  thread_start, thread_end, thread_out, thread_keys, thread_values);
219  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
220  thread_start, thread_end, thread_out, thread_keys, thread_values);
221  } else if (diff_y > diff_z && diff_z > diff_x) {
222  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
223  thread_start, thread_end, thread_out, thread_keys, thread_values);
224  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
225  thread_start, thread_end, thread_out, thread_keys, thread_values);
226  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
227  thread_start, thread_end, thread_out, thread_keys, thread_values);
228  } else if (diff_z > diff_x && diff_x > diff_y) {
229  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
230  thread_start, thread_end, thread_out, thread_keys, thread_values);
231  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
232  thread_start, thread_end, thread_out, thread_keys, thread_values);
233  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
234  thread_start, thread_end, thread_out, thread_keys, thread_values);
235  } else {
236  singleBisect(bit++, num_iters * 3, pmin.z, pmax.z, pos_z + offset, numAtoms, s_indexBuffer,
237  thread_start, thread_end, thread_out, thread_keys, thread_values);
238  singleBisect(bit++, num_iters * 3, pmin.y, pmax.y, pos_y + offset, numAtoms, s_indexBuffer,
239  thread_start, thread_end, thread_out, thread_keys, thread_values);
240  singleBisect(bit++, num_iters * 3, pmin.x, pmax.x, pos_x + offset, numAtoms, s_indexBuffer,
241  thread_start, thread_end, thread_out, thread_keys, thread_values);
242  }
243  }
244 
245  for (int i = 0; i < kValuesPerThread; i++) {
246  const int idx = kValuesPerThread * threadIdx.x + i;
247  if (idx < numAtoms) {
248  sortIndex[offset + idx] = thread_out[i];
249  sortOrder[offset + idx] = idx;
250  }
251  }
252  __syncthreads();
253 
254  }
255 }
256 
257 
258 __device__ __forceinline__ int simple_grid(const int numGrids, const int x, const int y, const int z) {
259  const int index = x + numGrids * (y + numGrids * z);
260  return index;
261 }
262 
263 __device__ __forceinline__ int snake_grid(const int numGrids, const int x, const int y, const int z) {
264  int index = numGrids * numGrids * x;
265  if (x % 2 == 0) {
266  index += numGrids * y;
267  } else {
268  index += numGrids * (numGrids - 1 - y);
269  }
270  if (y % 2 == x % 2) {
271  index += z;
272  } else {
273  index += (numGrids - 1 - z);
274  }
275  return index;
276 }
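// Illustration of the serpentine ordering produced by snake_grid (added for clarity;
// not used by the code). For numGrids = 2 the 2x2x2 sub-grid cells are visited as
//
//   (x,y,z): (0,0,0) (0,0,1) (0,1,1) (0,1,0) (1,1,0) (1,1,1) (1,0,1) (1,0,0)
//   index:      0       1       2       3       4       5       6       7
//
// so consecutive indices always correspond to spatially adjacent cells, which keeps
// nearby atoms close together in the sorted ordering.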
277 
278 
279 /**
280  * \brief Computes the nonbonded index of atoms for optimal clustering
281  *
282  * \par
283  * Each thread-block is assigned to a patch, and does the following:
284  * - First, the minimum and maximum coordinates of the patch are computed.
 285  * - First, the minimum and maximum coordinates of the atoms in the patch are computed.
 286  *   (Using the precomputed patchMin and patchMax doesn't produce the same results; it is
 287  *   unclear whether this is because of migration coordinates or something with the lattice.)
 288  * - Second, the nonbonded index of each atom is computed and stored.
 289  * The exact spatial hashing algorithm still needs investigation.
290  */
291 __global__ void computeNonbondedIndex(
292  const int numPatches,
293  const int numGrids,
294  const CudaLocalRecord* localRecords,
295  const double3* patchMin,
296  const double3* patchMax,
297  const double* pos_x,
298  const double* pos_y,
299  const double* pos_z,
300  int* sortOrder,
301  int* sortIndex
302 ) {
303  __shared__ CudaLocalRecord s_record;
304  using AccessType = int32_t;
305  AccessType* s_record_buffer = (AccessType*) &s_record;
306 
307  typedef cub::BlockReduce<double, MigrationCUDAKernel::kSortNumThreads> BlockReduce;
308  __shared__ typename BlockReduce::TempStorage temp_storage;
309 
310  __shared__ double3 s_pmin;
311  __shared__ double3 s_pmax;
312 
313  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
314  #pragma unroll
315  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
316  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
317  }
318  __syncthreads();
319 
320  const int numAtoms = s_record.numAtoms;
321  const int offset = s_record.bufferOffset;
322 
323  // Compute the thread-local min/max values
324  double3 pmin, pmax;
325  pmin.x = BIG_FLOAT;
326  pmin.y = BIG_FLOAT;
327  pmin.z = BIG_FLOAT;
328  pmax.x = SMALL_FLOAT;
329  pmax.y = SMALL_FLOAT;
330  pmax.z = SMALL_FLOAT;
331  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
332  pmin.x = fmin(pmin.x, pos_x[offset + i]);
333  pmin.y = fmin(pmin.y, pos_y[offset + i]);
334  pmin.z = fmin(pmin.z, pos_z[offset + i]);
335  pmax.x = fmax(pmax.x, pos_x[offset + i]);
336  pmax.y = fmax(pmax.y, pos_y[offset + i]);
337  pmax.z = fmax(pmax.z, pos_z[offset + i]);
338  }
339  __syncthreads();
340 
341  // Compute the thread-block min/max values
342  pmin.x = BlockReduce(temp_storage).Reduce(pmin.x, cub::Min());
343  __syncthreads();
344  pmin.y = BlockReduce(temp_storage).Reduce(pmin.y, cub::Min());
345  __syncthreads();
346  pmin.z = BlockReduce(temp_storage).Reduce(pmin.z, cub::Min());
347  __syncthreads();
348 
349  pmax.x = BlockReduce(temp_storage).Reduce(pmax.x, cub::Max());
350  __syncthreads();
351  pmax.y = BlockReduce(temp_storage).Reduce(pmax.y, cub::Max());
352  __syncthreads();
353  pmax.z = BlockReduce(temp_storage).Reduce(pmax.z, cub::Max());
354  __syncthreads();
355 
356  if (threadIdx.x == 0) {
357  s_pmin = pmin;
358  s_pmax = pmax;
359  }
360  __syncthreads();
361 
362  pmin = s_pmin;
363  pmax = s_pmax;
364 
365  // Compute the sort index
366  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
367  const double x = pos_x[offset + i];
368  const double y = pos_y[offset + i];
369  const double z = pos_z[offset + i];
370 
371  // Compute subpatch index
372  int idx_x = (int)((x - pmin.x) / (pmax.x - pmin.x) * ((double) numGrids));
373  int idx_y = (int)((y - pmin.y) / (pmax.y - pmin.y) * ((double) numGrids));
374  int idx_z = (int)((z - pmin.z) / (pmax.z - pmin.z) * ((double) numGrids));
375  idx_x = min(max(idx_x, 0), numGrids-1);
376  idx_y = min(max(idx_y, 0), numGrids-1);
377  idx_z = min(max(idx_z, 0), numGrids-1);
378 
379  // Compute sort index
380  const int block_index = snake_grid(numGrids, idx_x, idx_y, idx_z);
381  const double z_norm = (z - pmin.z) / (pmax.z - pmin.z);
382 
383  const int reverse = ((idx_y % 2) == (idx_x % 2));
384  int inner_index;
385  if (reverse) {
386  inner_index = (unsigned int) (z_norm * (1 << 16));
387  } else {
388  inner_index = ~((unsigned int) (z_norm * (1 << 16)));
389  }
390 
391  sortIndex[offset + i] = (block_index << 17) + inner_index;
392  sortOrder[offset + i] = i;
393  }
394  __syncthreads();
395  }
396 }
397 
398 /**
 399  * \brief Sorts atoms into the nonbonded ordering based on previously computed indices
 400  *
 401  * \par
 402  * Uses CUB's block-level radix sort to generate the nonbonded ordering based on the
 403  * previously computed spatial hashing. It generates both a forward and a backward mapping.
404  *
405  */
406 __global__ void sortAtomsKernel(
407  const int numPatches,
408  const CudaLocalRecord* localRecords,
409  int* sortOrder,
410  int* unsortOrder,
411  int* sortIndex
412 ) {
413  __shared__ CudaLocalRecord s_record;
414  using AccessType = int32_t;
415  AccessType* s_record_buffer = (AccessType*) &s_record;
416 
417  typedef cub::BlockRadixSort<int, MigrationCUDAKernel::kSortNumThreads, MigrationCUDAKernel::kValuesPerThread, int> BlockRadixSort;
418  __shared__ typename BlockRadixSort::TempStorage temp_storage;
419 
420  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 421  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
422  #pragma unroll 1
423  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
424  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
425  }
426  __syncthreads();
427 
428  const int numAtoms = s_record.numAtoms;
429  const int offset = s_record.bufferOffset;
430 
431  int thread_keys[MigrationCUDAKernel::kValuesPerThread];
432  int thread_values[MigrationCUDAKernel::kValuesPerThread];
433  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
434  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
435  if (idx < numAtoms) {
436  thread_keys[i] = sortIndex[offset + idx];
437  thread_values[i] = sortOrder[offset + idx];
438  } else {
439  thread_keys[i] = MAX_VALUE;
440  thread_values[i] = -1;
441  }
442  }
443  __syncthreads();
444 
445  BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
446  __syncthreads();
447 
448  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
449  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
450  if (thread_keys[i] != MAX_VALUE) {
451  unsortOrder[offset + thread_values[i]] = idx;
452  sortOrder[offset + idx] = thread_values[i];
453  sortIndex[offset + idx] = thread_keys[i];
454  }
455  }
456  __syncthreads();
457  }
458 }
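// After sortAtomsKernel, for each patch with buffer offset `offset`:
//   sortOrder[offset + newIdx]   = oldIdx   (sorted position -> original position)
//   unsortOrder[offset + oldIdx] = newIdx   (original position -> sorted position)
//   sortIndex[offset + newIdx]   = spatial key of the atom now at newIdx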
459 
460 
461 __global__ void printSortOrder(
462  const int numPatches,
463  const CudaLocalRecord* localRecords,
464  int* sortOrder,
465  const double* pos_x,
466  const double* pos_y,
467  const double* pos_z
468 ) {
469  __shared__ CudaLocalRecord s_record;
470  using AccessType = int32_t;
471  AccessType* s_record_buffer = (AccessType*) &s_record;
472 
473  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 474  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
475  #pragma unroll 1
476  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
477  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
478  }
479  __syncthreads();
480 
481  const int numAtoms = s_record.numAtoms;
482  const int offset = s_record.bufferOffset;
483 
484  if (patchIndex == 0) {
485  if (threadIdx.x == 0) {
486  for (int i = 0; i < numAtoms; i ++) {
487  printf("%d %d %f %f %f\n", i, sortOrder[offset + i], pos_x[offset + i], pos_y[offset+i], pos_z[offset + i]);
488  }
489  }
490  }
491  __syncthreads();
492  }
493 }
494 
495 void MigrationCUDAKernel::sortAtoms(
496  const int numPatches,
497  const int numAtoms,
498  const CudaLocalRecord* records,
499  const double3* patchMin,
500  const double3* patchMax,
501  const double* pos_x,
502  const double* pos_y,
503  const double* pos_z,
504  int* sortOrder,
505  int* unsortOrder,
506  int* sortIndex,
507  cudaStream_t stream
508 ) {
509  constexpr int numThreads = kSortNumThreads;
510  const int numBlocks = numPatches;
511 
512  // Knob for the spatial hashing
513  const int numGrid = 4;
514 
515  computeBisect<<<numBlocks, numThreads, 0, stream>>>(
516 // computeNonbondedIndex<<<numBlocks, numThreads, 0, stream>>>(
517  numPatches,
518  numGrid,
519  records,
520  patchMin,
521  patchMax,
522  pos_x,
523  pos_y,
524  pos_z,
525  sortOrder,
526  sortIndex
527  );
528 
529  sortAtomsKernel<<<numBlocks, numThreads, 0, stream>>>(
530  numPatches,
531  records,
532  sortOrder,
533  unsortOrder,
534  sortIndex
535  );
536 
537 #if 0
538  printSortOrder<<<numBlocks, numThreads, 0, stream>>>(
539  numPatches,
540  records,
541  sortOrder,
542  pos_x,
543  pos_y,
544  pos_z
545  );
546 #endif
547 }
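// Illustrative call site for sortAtoms (a sketch only; the d_* buffer names and the
// `kernel` object are hypothetical, and all pointers are assumed to be device
// allocations sized by the caller):
//
//   MigrationCUDAKernel kernel;
//   kernel.sortAtoms(numPatches, numAtoms,
//                    d_localRecords, d_patchMin, d_patchMax,
//                    d_pos_x, d_pos_y, d_pos_z,
//                    d_sortOrder, d_unsortOrder, d_sortIndex,
//                    stream);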
548 
549 
550 /**
551  * \brief Computes the derived quantities for SoA data
552  *
553  * \par
 554  * Some data isn't stored in the AoS data. This kernel computes quantities that are
 555  * used elsewhere in the calculation, such as the reciprocal mass.
556  *
557  */
558 __global__ void calculateDerivedKernel(
559  const int numPatches,
560  const CudaLocalRecord* localRecords,
561  float* mass,
562  double* recipMass
563 ) {
564  __shared__ CudaLocalRecord s_record;
565  using AccessType = int32_t;
566  AccessType* s_record_buffer = (AccessType*) &s_record;
567 
568  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 569  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
570  #pragma unroll 1
571  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
572  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
573  }
574  __syncthreads();
575 
576  const int numAtoms = s_record.numAtoms;
577  const int offset = s_record.bufferOffset;
578 
579  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
580  const float m = mass[offset + i];
581  recipMass[offset + i] = (m > 0) ? 1.0f / m : 0.0;
582  }
583  __syncthreads();
584  }
585 }
586 
587 /**
588  * \brief Computes the Langevin derived quantities for SoA data
589  *
590  * \par
 591  * Some data isn't stored in the AoS data. This kernel computes quantities that are
 592  * used elsewhere in the Langevin calculation, such as langScalRandBBK2.
593  *
594  */
595 template<bool kDoAlch>
596 __global__ void calculateDerivedLangevinKernel(
597  const int numPatches,
598  const CudaLocalRecord* localRecords,
599  const double dt,
600  const double kbT,
601  const double tempFactor,
602  double* recipMass,
603  int* partition,
604  float* langevinParam,
605  float* langScalVelBBK2,
606  float* langScalRandBBK2
607 ) {
608  __shared__ CudaLocalRecord s_record;
609  using AccessType = int32_t;
610  AccessType* s_record_buffer = (AccessType*) &s_record;
611 
612  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 613  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
614  #pragma unroll 1
615  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
616  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
617  }
618  __syncthreads();
619 
620  const int numAtoms = s_record.numAtoms;
621  const int offset = s_record.bufferOffset;
622 
623  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
624  const double dt_gamma = dt * langevinParam[offset + i];
625  const float invm = recipMass[offset + i];
626  const double factor = (kDoAlch && partition[offset + i]) ? tempFactor : 1.0;
627  langScalRandBBK2[offset + i] = (float) sqrt( 2.0 * dt_gamma * kbT * factor * invm);
628  langScalVelBBK2[offset + i] = (float) (1.0 / (1.0 + 0.5 * dt_gamma));
629  }
630  __syncthreads();
631  }
632 }
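// In the notation of the kernel above, with gamma = langevinParam[i] and
// m = 1 / recipMass[i] (factor = tempFactor only for alchemical partition atoms):
//
//   langScalRandBBK2[i] = sqrt( 2 * dt * gamma * kbT * factor / m )
//   langScalVelBBK2[i]  = 1 / ( 1 + 0.5 * dt * gamma )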
633 
634 
635 
636 /**
637  * \brief Copies AoS data into SoA buffers
638  *
639  * \par
640  * During migration, atomic data is stored as AoS. After migration, we must copy
641  * the data to SoA buffers for the rest of the calculation. Since the AoS atomic data
642  * is 128 bytes, we can do this with efficient memory access using shared memory as a
643  * buffer for the transpose
644  *
645  * TODO Remove ugly if statement for writing buffers
646  *
647  */
648 template<bool kDoAlch, bool kDoLangevin>
649 __global__ void copy_AoS_to_SoAKernel(
650  const int numPatches,
651  const CudaLocalRecord* localRecords,
652  const FullAtom* atomdata_AoS,
653  double* vel_x,
654  double* vel_y,
655  double* vel_z,
656  double* pos_x,
657  double* pos_y,
658  double* pos_z,
659  float* mass,
660  float* charge,
661  int* id,
662  int* vdwType,
663  int* hydrogenGroupSize,
664  int* migrationGroupSize,
665  int* atomFixed,
666  float* rigidBondLength,
667  char3* transform,
668  int* partition,
669  float* langevinParam
670 ) {
671  constexpr int kAtomsPerBuffer = MigrationCUDAKernel::kAtomsPerBuffer;
672  constexpr int kNumSoABuffers = MigrationCUDAKernel::kNumSoABuffers;
673 
674  __shared__ CudaLocalRecord s_record;
675  using AccessType = int32_t;
676  AccessType* s_record_buffer = (AccessType*) &s_record;
677 
 678  // FullAtom contains "Vectors" which initialize to 99999. This isn't allowed for shared memory,
 679  // so we suppress this warning since the code doesn't rely on the dynamic initialization
680 #ifdef NAMD_CUDA
681  #pragma diag_suppress static_var_with_dynamic_init
682  __shared__ FullAtom s_atombuffer[kAtomsPerBuffer]; // For a 128 byte cache line
683 #else // NAMD_HIP
 684  // NVCC might allow the code above, but dynamic initialization of shared memory
 685  // is not allowed by clang
686  extern __shared__ FullAtom s_atombuffer[];
687 #endif
688 
689  const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
690  const int wid = threadIdx.x / WARPSIZE;
691  const int tid = threadIdx.x % WARPSIZE;
692 
693  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 694  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
695  #pragma unroll 1
696  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
697  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
698  }
699  __syncthreads();
700 
701  const int numAtoms = s_record.numAtoms;
702  const int numAtomsPad = (s_record.numAtoms + kAtomsPerBuffer - 1) / kAtomsPerBuffer;
703  const int offset = s_record.bufferOffset;
704 
705  for (int i = 0; i < numAtomsPad; i++) {
706  const int atomOffset = i * kAtomsPerBuffer;
707  // Load atoms. 1 warp per atom
708  for (int atom_idx = wid; atom_idx < kAtomsPerBuffer; atom_idx += warps_per_threadblock) {
709  if (atomOffset + atom_idx < numAtoms) {
710  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
711  int32_t *s_int = (int32_t*) &(s_atombuffer[atom_idx]);
712  int32_t *g_int = (int32_t*) &(atomdata_AoS[offset + atomOffset + atom_idx]);
713  s_int[tid] = g_int[tid];
714  }
715  }
716  }
717  __syncthreads();
718 
719  // Store atoms in SoA. 1 thread per atom
720  for (int buffer = wid; buffer < kNumSoABuffers; buffer += warps_per_threadblock) {
721  if (atomOffset + tid < numAtoms) {
722  if (buffer == 0) vel_x [offset + atomOffset + tid] = ((FullAtom)(s_atombuffer[tid])).velocity.x;
723  if (buffer == 1) vel_y [offset + atomOffset + tid] = s_atombuffer[tid].velocity.y;
724  if (buffer == 2) vel_z [offset + atomOffset + tid] = s_atombuffer[tid].velocity.z;
725  if (buffer == 3) pos_x [offset + atomOffset + tid] = s_atombuffer[tid].position.x;
726  if (buffer == 4) pos_y [offset + atomOffset + tid] = s_atombuffer[tid].position.y;
727  if (buffer == 5) pos_z [offset + atomOffset + tid] = s_atombuffer[tid].position.z;
728  if (buffer == 6) mass [offset + atomOffset + tid] = s_atombuffer[tid].mass;
729  if (buffer == 7) charge [offset + atomOffset + tid] = s_atombuffer[tid].charge;
730  if (buffer == 8) id [offset + atomOffset + tid] = s_atombuffer[tid].id;
731  if (buffer == 9) vdwType [offset + atomOffset + tid] = s_atombuffer[tid].vdwType;
732  if (buffer == 10) hydrogenGroupSize [offset + atomOffset + tid] = s_atombuffer[tid].hydrogenGroupSize;
733  if (buffer == 11) migrationGroupSize[offset + atomOffset + tid] = s_atombuffer[tid].migrationGroupSize;
734  if (buffer == 12) atomFixed [offset + atomOffset + tid] = s_atombuffer[tid].atomFixed;
735  if (buffer == 13) rigidBondLength [offset + atomOffset + tid] = s_atombuffer[tid].rigidBondLength;
736  if (buffer == 14) transform [offset + atomOffset + tid] = s_atombuffer[tid].transform;
737  if (kDoAlch &&
738  buffer == 15) partition [offset + atomOffset + tid] = s_atombuffer[tid].partition;
739  if (kDoLangevin &&
740  buffer == 16) langevinParam [offset + atomOffset + tid] = s_atombuffer[tid].langevinParam;
741  }
742  }
743  __syncthreads();
744  }
745  __syncthreads();
746  }
747 }
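// Access pattern of copy_AoS_to_SoAKernel, summarized: the copy proceeds in tiles of
// kAtomsPerBuffer atoms. In the load phase each warp gathers one FullAtom (~128 bytes)
// at a time from global memory as consecutive 32-bit words, one word per lane, into
// shared memory. In the store phase each warp handles one SoA output buffer and lane t
// writes the field of shared-memory atom t, so the SoA writes are contiguous across
// the tile.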
748 
749 
750 void MigrationCUDAKernel::copy_AoS_to_SoA(
751  const int numPatches,
752  const bool alchOn,
753  const bool langevinOn,
754  const double dt,
755  const double kbT,
756  const double tempFactor,
757  const CudaLocalRecord* records,
758  const FullAtom* atomdata_AoS,
759  double* recipMass,
760  double* vel_x,
761  double* vel_y,
762  double* vel_z,
763  double* pos_x,
764  double* pos_y,
765  double* pos_z,
766  float* mass,
767  float* charge,
768  int* id,
769  int* vdwType,
770  int* hydrogenGroupSize,
771  int* migrationGroupSize,
772  int* atomFixed,
773  float* rigidBondLength,
774  char3* transform,
775  int* partition,
776  float* langevinParam,
777  float* langScalVelBBK2,
778  float* langScalRandBBK2,
779  cudaStream_t stream
780 ) {
781  constexpr int numThreads = kSortNumThreads;
782  const int numBlocks = numPatches;
783 
784 #ifdef NAMD_CUDA
785  constexpr size_t sizeatoms = 0;
786 #else
787  constexpr size_t sizeatoms = MigrationCUDAKernel::kAtomsPerBuffer*sizeof(FullAtom);
788 #endif
789 
790  #define CALL(alchOn, langevinOn) do { \
791  if (numBlocks) { \
792  copy_AoS_to_SoAKernel<alchOn, langevinOn><<<numBlocks, numThreads, sizeatoms, stream>>>( \
793  numPatches, \
794  records, \
795  atomdata_AoS, \
796  vel_x, vel_y, vel_z, \
797  pos_x, pos_y, pos_z, \
798  mass, charge, \
799  id, vdwType, \
800  hydrogenGroupSize, \
801  migrationGroupSize, \
802  atomFixed, \
803  rigidBondLength, \
804  transform, \
805  partition, \
806  langevinParam \
807  ); \
808  } \
809  } while (0);
810 
811  if (alchOn && langevinOn) {
812  CALL(true, true);
813  } else if (!alchOn && langevinOn) {
814  CALL(false, true);
815  } else if (alchOn && !langevinOn) {
816  CALL(true, false);
817  } else {
818  CALL(false, false);
819  }
820 
821  #undef CALL
822 
823  calculateDerivedKernel<<<numBlocks, numThreads, 0, stream>>>(
824  numPatches,
825  records,
826  mass,
827  recipMass
828  );
829 
830  // This needs the recipMass
831  if (langevinOn) {
832  if (alchOn) {
833  calculateDerivedLangevinKernel<true><<<numBlocks, numThreads, 0, stream>>>(
834  numPatches,
835  records,
836  dt, kbT, tempFactor,
837  recipMass,
838  partition,
839  langevinParam,
840  langScalVelBBK2,
841  langScalRandBBK2
842  );
843  } else {
844  calculateDerivedLangevinKernel<false><<<numBlocks, numThreads, 0, stream>>>(
845  numPatches,
846  records,
847  dt, kbT, tempFactor,
848  recipMass,
849  partition,
850  langevinParam,
851  langScalVelBBK2,
852  langScalRandBBK2
853  );
854  }
855  }
856 }
857 
858 
859 /**
 860  * \brief Computes the solvent index within the patch
 861  *
 862  * \par
 863  * Within a patch, the atoms must be sorted such that solvent atoms are at the end of the
 864  * patch. This sorting must be stable.
 865  *
 866  * We do this with one scan to compute the indices of the solute atoms and a second scan
 867  * to compute the indices of the solvent atoms.
868  *
869  */
870 __global__
871 void computeSolventIndexKernel(
872  const int numPatches,
873  const CudaLocalRecord* localRecords,
874  const FullAtom* atomdata_AoS_in,
875  int* sortIndex
876 ) {
877  __shared__ CudaLocalRecord s_record;
878  using AccessType = int32_t;
879  AccessType* s_record_buffer = (AccessType*) &s_record;
880 
881  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
882  __shared__ typename BlockScan::TempStorage temp_storage;
883 
884  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 885  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
886  #pragma unroll 1
887  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
888  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
889  }
890  __syncthreads();
891 
892  const int numAtoms = s_record.numAtoms;
893  const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
894 
895  int thread_input[MigrationCUDAKernel::kValuesPerThread];
896  int thread_soluteIndex[MigrationCUDAKernel::kValuesPerThread];
897  int thread_solventIndex[MigrationCUDAKernel::kValuesPerThread];
898  int numSolute;
899 
900  // Load the isWater value
901  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
902  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
903  if (idx < numAtoms) {
904  thread_input[i] = 1 - atomdata_AoS_in[offsetIn + idx].isWater;
905  } else {
906  thread_input[i] = 0;
907  }
908  }
909  __syncthreads();
910 
 911  // Compute the prefix sum of solute atoms
912  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_soluteIndex, numSolute);
913  __syncthreads();
914 
915  // Flip input data
916  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
917  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
918  if (idx < numAtoms) {
919  thread_input[i] = 1 - thread_input[i];
920  } else {
921  thread_input[i] = 0;
922  }
923  }
924  __syncthreads();
925 
926  // Compute the prefix sum of water
927  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_solventIndex);
928  __syncthreads();
929 
930  // write index
931  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
932  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
933  if (idx < numAtoms) {
934  if (thread_input[i]) {
935  sortIndex[offsetIn + idx] = numSolute + thread_solventIndex[i];
936  } else {
937  sortIndex[offsetIn + idx] = thread_soluteIndex[i];
938  }
939  }
940  }
941  __syncthreads();
942  }
943 }
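// Worked example for computeSolventIndexKernel (illustrative values): with
// isWater = [0, 1, 1, 0, 1], the solute flags are [1, 0, 0, 1, 0], whose exclusive
// scan gives soluteIndex = [0, 1, 1, 1, 2] and numSolute = 2; the flipped solvent
// flags [0, 1, 1, 0, 1] scan to solventIndex = [0, 0, 1, 2, 2]. The resulting
// destination indices are
//   sortIndex = [0, 2, 3, 1, 4]
// i.e. solute atoms keep their relative order in slots [0, numSolute) and solvent
// atoms follow in slots [numSolute, numAtoms).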
944 
945 /**
 946  * \brief Sorts atoms into solute-solvent ordering based on previously computed indices
947  *
948  * \par
949  * Using the previously computed indices, this kernel moves the atomic data in AoS format
950  * into the desired order. Note, this kernel copies the atomic data from the scratch buffer
951  * into the true atomic AoS buffer. The actual migration data movement copies into the scratch
952  * buffer and this kernel copies back.
953  *
 954  * Since the AoS data is ~128 bytes per atom, each warp moves a single atom.
955  *
956  */
957 __global__ void sortSolventKernel(
958  const int numPatches,
959  const CudaLocalRecord* localRecords,
960  const FullAtom* atomdata_AoS_in,
961  FullAtom* atomdata_AoS_out,
962  const int* sortIndex
963 ) {
964  __shared__ CudaLocalRecord s_record;
965  using AccessType = int32_t;
966  AccessType* s_record_buffer = (AccessType*) &s_record;
967 
968  const int warps_per_threadblock = MigrationCUDAKernel::kSortNumThreads / WARPSIZE;
969  const int wid = threadIdx.x / WARPSIZE;
970  const int tid = threadIdx.x % WARPSIZE;
971 
972  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 973  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
974  #pragma unroll 1
975  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
976  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
977  }
978  __syncthreads();
979 
980  const int numAtoms = s_record.numAtoms;
981 
 982  // This was causing issues with CUDA 11.3: the offset needed to be made
 983  // explicitly 64-bit
984 #if 0
985  const int64_t offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
986  const int64_t offset = s_record.bufferOffset;
987 #else
988  const int offsetIn = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
989  const int offset = s_record.bufferOffset;
990 #endif
991 
992  for (int i = wid; i < numAtoms; i+=warps_per_threadblock) {
993  const int dst_idx = sortIndex[offsetIn + i];
994  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
995  int32_t *src_int = (int32_t*) &(atomdata_AoS_in[offsetIn + i]);
996  int32_t *dst_int = (int32_t*) &(atomdata_AoS_out[offset + dst_idx]);
997  dst_int[tid] = src_int[tid];
998  }
999 #if defined(NAMD_HIP)
1000  NAMD_WARP_SYNC(WARP_FULL_MASK);
1001 #else
1002  WARP_SYNC(WARP_FULL_MASK);
1003 #endif
1004  }
1005 
1006  __syncthreads();
1007  }
1008 }
1009 
1010 void MigrationCUDAKernel::sortSolventAtoms(
1011  const int numPatches,
1012  const CudaLocalRecord* records,
1013  const FullAtom* atomdata_AoS_in,
1014  FullAtom* atomdata_AoS_out,
1015  int* sortIndex,
1016  cudaStream_t stream
1017 ) {
1018  constexpr int numThreads = kSortNumThreads;
1019  const int numBlocks = numPatches;
1020 
1021  computeSolventIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1022  numPatches,
1023  records,
1024  atomdata_AoS_in,
1025  sortIndex
1026  );
1027 
1028  sortSolventKernel<<<numBlocks, numThreads, 0, stream>>>(
1029  numPatches,
1030  records,
1031  atomdata_AoS_in,
1032  atomdata_AoS_out,
1033  sortIndex
1034  );
1035 }
1036 
1037 /**
1038  * \brief Computes the migration group index
1039  *
1040  * \par
 1041  * Atom migration must occur at the migration group level because some atoms are moved
 1042  * together as a group, e.g. the hydrogens of a water molecule are moved based on the oxygen's position.
 1043  *
 1044  * This kernel computes the number of migration groups per patch as well as their starting
 1045  * indices, which are used later during migration. It does this with a scan operation.
1046  *
1047  */
1048 __global__ void computeMigrationGroupIndexKernel(
1049  const int numPatches,
1050  CudaLocalRecord* localRecords,
1051  const int* migrationGroupSize,
1052  int* migrationGroupIndex
1053 ) {
1054  __shared__ CudaLocalRecord s_record;
1055  using AccessType = int32_t;
1056  AccessType* s_record_buffer = (AccessType*) &s_record;
1057 
1058  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1059  __shared__ typename BlockScan::TempStorage temp_storage;
1060 
1061  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 1062  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
1063  #pragma unroll 1
1064  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1065  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1066  }
1067  __syncthreads();
1068 
1069  const int numAtoms = s_record.numAtoms;
1070  const int offset = s_record.bufferOffset;
1071 
1072  int thread_size[MigrationCUDAKernel::kValuesPerThread];
1073  int thread_index[MigrationCUDAKernel::kValuesPerThread];
1074  int numGroups;
1075 
1076  // Load the migration group size
1077  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1078  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1079  if (idx < numAtoms) {
1080  thread_size[i] = migrationGroupSize[offset + idx] ? 1 : 0;
1081  } else {
1082  thread_size[i] = 0;
1083  }
1084  }
1085  __syncthreads();
1086 
 1087  // Compute the prefix sum of migration group flags
1088  BlockScan(temp_storage).ExclusiveSum(thread_size, thread_index, numGroups);
1089  __syncthreads();
1090 
1091  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1092  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1093  if (thread_size[i] != 0) {
1094  migrationGroupIndex[offset + thread_index[i]] = idx;
1095  }
1096  }
1097  __syncthreads();
1098 
1099  if (threadIdx.x == 0) {
1100  localRecords[patchIndex].numMigrationGroups = numGroups;
1101  }
1102  __syncthreads();
1103  }
1104 }
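// Worked example for computeMigrationGroupIndexKernel (illustrative values): with
// migrationGroupSize = [3, 0, 0, 1, 2, 0] the group flags are [1, 0, 0, 1, 1, 0],
// whose exclusive scan is [0, 1, 1, 1, 2, 3] with numGroups = 3, so
// migrationGroupIndex = [0, 3, 4]: the three migration groups start at atoms
// 0, 3, and 4 within the patch.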
1105 
1106 
1107 void MigrationCUDAKernel::computeMigrationGroupIndex(
1108  const int numPatches,
1109  CudaLocalRecord* records,
1110  const int* migrationGroupSize,
1111  int* migrationGroupIndex,
1112  cudaStream_t stream
1113 ) {
1114  constexpr int numThreads = kSortNumThreads;
1115  const int numBlocks = numPatches;
1116 
1117  computeMigrationGroupIndexKernel<<<numBlocks, numThreads, 0, stream>>>(
1118  numPatches,
1119  records,
1120  migrationGroupSize,
1121  migrationGroupIndex
1122  );
1123 }
1124 
1125 /**
1126  * \brief Computes the transformed positions of migrated atoms
1127  *
1128  * \par
 1129  * When atoms move between patches, their positions need to be transformed based
 1130  * on the lattice. This logic is largely copied from HomePatch.C. The transform is only
 1131  * applied to the atoms that have actually moved between patches, using the numAtomsLocal
 1132  * field of the CudaLocalRecord.
 1133  *
 1134  * Note this kernel could likely be optimized: threads that are not assigned to the
 1135  * parent of a migration group do no work. However, not that many atoms actually migrate.
1136  *
1137  */
1138 __global__ void transformMigratedPositionsKernel(
1139  const int numPatches,
1140  const CudaLocalRecord* localRecords,
1141  const double3* patchCenter,
1142  FullAtom* atomdata_AoS,
1143  const Lattice lattice
1144 ) {
1145  __shared__ CudaLocalRecord s_record;
1146  using AccessType = int32_t;
1147  AccessType* s_record_buffer = (AccessType*) &s_record;
1148 
1149  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 1150  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
1151  #pragma unroll 1
1152  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1153  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1154  }
1155  __syncthreads();
1156 
1157  const int offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1158  const int numAtoms = s_record.numAtoms;
1159  const int numAtomsLocal = s_record.numAtomsLocal;
1160  const int numAtomsMigrated = numAtoms - numAtomsLocal;
1161  const double3 center = patchCenter[patchIndex];
1162 
1163  for (int i = threadIdx.x; i < numAtomsMigrated; i += blockDim.x) {
1164  const int startingIndex = numAtomsLocal + i;
1165  const int migrationSize = atomdata_AoS[offset + startingIndex].migrationGroupSize;
1166  const int hydrogenSize = atomdata_AoS[offset + startingIndex].hydrogenGroupSize;
1167 
1168  if (migrationSize == 0) continue;
1169 
1170  Transform parent_transform;
1171  if (migrationSize != hydrogenSize) {
1172  double3 c_pos = make_double3(0.0, 0.0, 0.0);
1173  int c = 0;
1174  int tmp_hydrogenGroupSize = hydrogenSize;
1175  for (int j = 0; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1176  c_pos = c_pos + (double3) atomdata_AoS[offset + startingIndex + j].position;
1177  c++;
1178  tmp_hydrogenGroupSize = atomdata_AoS[offset + startingIndex + j].hydrogenGroupSize;
1179  }
1180  c_pos.x = c_pos.x / ((double) c);
1181  c_pos.y = c_pos.y / ((double) c);
1182  c_pos.z = c_pos.z / ((double) c);
1183 
1184  parent_transform = atomdata_AoS[offset+startingIndex].transform;
1185  c_pos = lattice.nearest(c_pos, center, &parent_transform);
1186 
1187  double3 pos = atomdata_AoS[offset+startingIndex].position;
1188  Transform transform = atomdata_AoS[offset+startingIndex].transform;
1189 
1190  pos = lattice.reverse_transform(pos, transform);
1191  pos = lattice.apply_transform(pos, parent_transform);
1192 
1193  atomdata_AoS[offset+startingIndex].transform = parent_transform;
1194  atomdata_AoS[offset+startingIndex].position = pos;
1195  } else {
1196 
1197  double3 pos = atomdata_AoS[offset+startingIndex].position;
1198  Transform transform = atomdata_AoS[offset+startingIndex].transform;
1199 
1200  pos = lattice.nearest(pos, center, &transform);
1201  parent_transform = transform;
1202 
1203  atomdata_AoS[offset+startingIndex].transform = transform;
1204  atomdata_AoS[offset+startingIndex].position = pos;
1205  }
1206  for (int j = 1; j < migrationSize; j++) {
1207  double3 pos = atomdata_AoS[offset+startingIndex + j].position;
1208  Transform transform = atomdata_AoS[offset+startingIndex +j].transform;
1209 
1210  pos = lattice.reverse_transform(pos, transform);
1211  pos = lattice.apply_transform(pos, parent_transform);
1212 
1213  atomdata_AoS[offset+startingIndex+j].transform = parent_transform;
1214  atomdata_AoS[offset+startingIndex+j].position = pos;
1215  }
1216  }
1217  __syncthreads();
1218  }
1219 }
1220 
1221 void MigrationCUDAKernel::transformMigratedPositions(
1222  const int numPatches,
1223  const CudaLocalRecord* localRecords,
1224  const double3* patchCenter,
1225  FullAtom* atomdata_AoS,
1226  const Lattice lattice,
1227  cudaStream_t stream
1228 ) {
1229  constexpr int numThreads = kSortNumThreads;
1230  const int numBlocks = numPatches;
1231 
1232  transformMigratedPositionsKernel<<<numBlocks, numThreads, 0, stream>>>(
1233  numPatches,
1234  localRecords,
1235  patchCenter,
1236  atomdata_AoS,
1237  lattice
1238  );
1239 }
1240 
1241 
1242 /**
1243  * \brief Computes the new location of a migration group
1244  *
1245  * \par
1246  * Computes the migration destination for each migration group. This info includes:
1247  * - Destination patch ID
 1248  * - Destination patch local index (this is the home patch index)
1249  * - Destination device
1250  *
 1251  * While the information is computed at the migration group level, it is stored for each atom
 1252  * in a migration group, so some of the later data movement can happen at the per-atom level.
 1253  *
 1254  * Note the migrationDestination structure will eventually include the atom's new index within a
 1255  * patch, but that is determined later.
1256  *
1257  */
1258 __global__ void
1259 computeMigrationDestinationKernel(
1260  const int numPatches,
1261  CudaLocalRecord* localRecords,
1262  const Lattice lattice,
1263  const CudaMInfo* mInfo,
1264  const int* patchToDeviceMap,
1265  const int* globalToLocalMap,
1266  const double3* patchMin,
1267  const double3* patchMax,
1268  const int* hydrogenGroupSize,
1269  const int* migrationGroupSize,
1270  const int* migrationGroupIndex,
1271  const double* pos_x,
1272  const double* pos_y,
1273  const double* pos_z,
1274  int4* migrationDestination
1275 ) {
1276  __shared__ CudaLocalRecord s_record;
1277  using AccessType = int32_t;
1278  AccessType* s_record_buffer = (AccessType*) &s_record;
1279 
1280  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 1281  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
1282  #pragma unroll 1
1283  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1284  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1285  }
1286 
1287  // Clear migration atom count
1288  if (threadIdx.x == 0) {
1289  localRecords[patchIndex].numAtomsNew = 0;
1290  }
1291  __syncthreads();
1292 
1293  const int numMigrationGroups = s_record.numMigrationGroups;
1294  const int offset = s_record.bufferOffset;
1295  const double3 min = patchMin[patchIndex];
1296  const double3 max = patchMax[patchIndex];
1297  CudaMInfo migration_info = mInfo[patchIndex];
1298 
1299  int xdev, ydev, zdev;
1300 
1301  for (int i = threadIdx.x; i < numMigrationGroups; i += blockDim.x) {
1302 
1303  const int startingIndex = migrationGroupIndex[offset + i];
1304  const int migrationSize = migrationGroupSize[offset + startingIndex];
1305  const int hydrogenSize = hydrogenGroupSize[offset + startingIndex];
1306 
1307  double3 pos;
1308  pos.x = pos_x[offset + startingIndex];
1309  pos.y = pos_y[offset + startingIndex];
1310  pos.z = pos_z[offset + startingIndex];
1311 
1312  if (migrationSize != hydrogenSize) {
1313  int c = 1;
1314  int tmp_hydrogenGroupSize = 1;
1315  for (int j = hydrogenSize; j < migrationSize; j+=tmp_hydrogenGroupSize) {
1316  pos.x = pos.x + pos_x[offset + startingIndex + j];
1317  pos.y = pos.y + pos_y[offset + startingIndex + j];
1318  pos.z = pos.z + pos_z[offset + startingIndex + j];
1319  c++;
1320  tmp_hydrogenGroupSize = hydrogenGroupSize[offset + startingIndex + j];
1321  }
1322  pos.x = pos.x / ((double) c);
1323  pos.y = pos.y / ((double) c);
1324  pos.z = pos.z / ((double) c);
1325  }
1326 
1327  double3 s = lattice.scale(pos);
1328 
1329  if (s.x < min.x) xdev = 0;
1330  else if (max.x <= s.x) xdev = 2;
1331  else xdev = 1;
1332 
1333  if (s.y < min.y) ydev = 0;
1334  else if (max.y <= s.y) ydev = 2;
1335  else ydev = 1;
1336 
1337  if (s.z < min.z) zdev = 0;
1338  else if (max.z <= s.z) zdev = 2;
1339  else zdev = 1;
1340 
1341  int dest_patch = migration_info.destPatchID[xdev][ydev][zdev];
1342  int dest_device = patchToDeviceMap[dest_patch];
1343  int dest_localID = globalToLocalMap[dest_patch];
1344  int4 dest_info;
1345  dest_info.x = dest_patch;
1346  dest_info.y = dest_device;
1347  dest_info.z = dest_localID;
1348  dest_info.w = 0;
1349 
1350  for (int j = 0; j < migrationSize; j++) {
1351  migrationDestination[offset + startingIndex + j] = dest_info;
1352  }
1353  }
1354  __syncthreads();
1355  }
1356 }
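// Note on the destination lookup above: (xdev, ydev, zdev) index the 3x3x3 cube of
// neighboring patches stored in CudaMInfo. A value of 1 means the group's scaled
// center stayed inside the patch along that axis, while 0 / 2 mean it fell below
// patchMin / at or above patchMax, so destPatchID[1][1][1] presumably refers to the
// patch itself.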
1357 
1358 void MigrationCUDAKernel::computeMigrationDestination(
1359  const int numPatches,
1360  CudaLocalRecord* localRecords,
1361  const Lattice lattice,
1362  const CudaMInfo* mInfo,
1363  const int* patchToDeviceMap,
1364  const int* globalToLocalMap,
1365  const double3* patchMin,
1366  const double3* patchMax,
1367  const int* hydrogenGroupSize,
1368  const int* migrationGroupSize,
1369  const int* migrationGroupIndex,
1370  const double* pos_x,
1371  const double* pos_y,
1372  const double* pos_z,
1373  int4* migrationDestination,
1374  cudaStream_t stream
1375 ) {
1376  constexpr int numThreads = kSortNumThreads;
1377  const int numBlocks = numPatches;
1378 
1379  computeMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1380  numPatches,
1381  localRecords,
1382  lattice,
1383  mInfo,
1384  patchToDeviceMap,
1385  globalToLocalMap,
1386  patchMin,
1387  patchMax,
1388  hydrogenGroupSize,
1389  migrationGroupSize,
1390  migrationGroupIndex,
1391  pos_x, pos_y, pos_z,
1392  migrationDestination
1393  );
1394 }
1395 
1396 
1397 /**
1398  * \brief Updates AoS data structure before migration
1399  *
1400  * \par
1401  * Copies the necessary data from the SoA buffers to the AoS buffers
1402  * This is just the positions and velocities
1403  *
1404  * TODO optimize this kernel
1405  *
1406  */
1407 __global__ void update_AoSKernel(
1408  const int numPatches,
1409  const CudaLocalRecord* localRecords,
1410  FullAtom* atomdata_AoS,
1411  const double* vel_x,
1412  const double* vel_y,
1413  const double* vel_z,
1414  const double* pos_x,
1415  const double* pos_y,
1416  const double* pos_z
1417 ) {
1418  __shared__ CudaLocalRecord s_record;
1419  using AccessType = int32_t;
1420  AccessType* s_record_buffer = (AccessType*) &s_record;
1421 
1422  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 1423  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
1424  #pragma unroll 1
1425  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1426  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1427  }
1428  __syncthreads();
1429 
1430  const int numAtoms = s_record.numAtoms;
1431  const int offset = s_record.bufferOffset;
1432 
1433  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1434  double3 pos;
1435  pos.x = pos_x[offset + i];
1436  pos.y = pos_y[offset + i];
1437  pos.z = pos_z[offset + i];
1438 
1439  double3 vel;
1440  vel.x = vel_x[offset + i];
1441  vel.y = vel_y[offset + i];
1442  vel.z = vel_z[offset + i];
1443 
1444  atomdata_AoS[offset + i].position = pos;
1445  atomdata_AoS[offset + i].velocity = vel;
1446  }
1447  __syncthreads();
1448  }
1449 }
1450 
1451 void MigrationCUDAKernel::update_AoS(
1452  const int numPatches,
1453  const CudaLocalRecord* records,
1454  FullAtom* atomdata_AoS,
1455  const double* vel_x,
1456  const double* vel_y,
1457  const double* vel_z,
1458  const double* pos_x,
1459  const double* pos_y,
1460  const double* pos_z,
1461  cudaStream_t stream
1462 ) {
1463  constexpr int numThreads = kSortNumThreads;
1464  const int numBlocks = numPatches;
1465 
1466  update_AoSKernel<<<numBlocks, numThreads, 0, stream>>>(
1467  numPatches,
1468  records,
1469  atomdata_AoS,
1470  vel_x, vel_y, vel_z,
1471  pos_x, pos_y, pos_z
1472  );
1473 }
1474 
1475 
1476 /**
1477  * \brief Copies atoms that aren't "moving" into the scratch buffers
1478  *
1479  * \par
 1480  * This kernel copies the atoms that are not moving to a new patch into their current
 1481  * patch's scratch buffers. It uses a scan operation to eliminate the need for
 1482  * atomic operations when computing each atom's new location.
 1483  *
 1484  * This scan does not change the order of the atoms within a migration group.
 1485  *
 1486  * Note this will set the localID of the atom in the migrationDestination.
 1487  * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom.
1488  *
1489  */
1490 __global__ void performLocalMigrationKernel(
1491  const int numPatches,
1492  CudaLocalRecord* localRecords,
1493  const FullAtom* atomdata_AoS_in,
1494  FullAtom* atomdata_AoS_out,
1495  int4* migrationDestination
1496 ) {
1497  __shared__ CudaLocalRecord s_record;
1498  using AccessType = int32_t;
1499  AccessType* s_record_buffer = (AccessType*) &s_record;
1500 
1501  typedef cub::BlockScan<int, MigrationCUDAKernel::kSortNumThreads> BlockScan;
1502  __shared__ typename BlockScan::TempStorage temp_storage;
1503 
1504  __shared__ int s_local_index[MigrationCUDAKernel::kSortNumThreads * MigrationCUDAKernel::kValuesPerThread];
1505 
1506  const int warps_per_threadblock = blockDim.x / WARPSIZE;
1507  const int wid = threadIdx.x / WARPSIZE;
1508  const int tid = threadIdx.x % WARPSIZE;
1509 
1510  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
 1511  // Read in the CudaLocalRecord using multiple threads. This should be a coalesced read.
1512  #pragma unroll 1
1513  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1514  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1515  }
1516  __syncthreads();
1517 
1518  const int patchID = s_record.patchID;
1519  const int numAtoms = s_record.numAtoms;
1520  const int offset = s_record.bufferOffset;
1521  const int dst_offset = patchIndex * MigrationCUDAKernel::kMaxAtomsPerPatch;
1522 
1523  __syncthreads();
1524  // Load in if the atom is staying in the same patch
1525  int thread_input[MigrationCUDAKernel::kValuesPerThread];
1526  int thread_output[MigrationCUDAKernel::kValuesPerThread];
1527  int numLocal;
1528  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1529  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1530  if (idx < numAtoms && migrationDestination[offset + idx].x == patchID) {
1531  thread_input[i] = 1;
1532  } else {
1533  thread_input[i] = 0;
1534  }
1535  }
1536  __syncthreads();
1537 
1538  // Compute the prefix sum of local
1539  BlockScan(temp_storage).ExclusiveSum(thread_input, thread_output, numLocal);
1540  __syncthreads();
1541 
1542  for (int i = 0; i < MigrationCUDAKernel::kValuesPerThread; i++) {
1543  const int idx = MigrationCUDAKernel::kValuesPerThread * threadIdx.x + i;
1544  if (idx < numAtoms && thread_input[i]) {
1545  s_local_index[thread_output[i]] = idx;
1546  }
1547  }
1548  if (threadIdx.x == 0) {
1549  localRecords[patchIndex].numAtomsLocal = numLocal;
1550  localRecords[patchIndex].numAtomsNew = numLocal;
1551  }
1552  __syncthreads();
1553 
1554  // We can do this at the atom level instead of the migrationGroup level
1555  // because the migrationDestinations take that into account and
1556  // the prefix sum means we do not change the ordering of local atoms
1557  for (int i = wid; i < numLocal; i += warps_per_threadblock) {
1558  const int src_atom_idx = s_local_index[i];
1559  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1560  int32_t *src_int = (int32_t*) &(atomdata_AoS_in [offset + src_atom_idx]);
1561  int32_t *dst_int = (int32_t*) &(atomdata_AoS_out [dst_offset + i]);
1562  dst_int[tid] = src_int[tid];
1563  }
1564  if (tid == 0) {
1565  migrationDestination[offset + src_atom_idx].w = i;
1566  }
1567 #if defined(NAMD_HIP)
1568  NAMD_WARP_SYNC(WARP_FULL_MASK);
1569 #else
1570  WARP_SYNC(WARP_FULL_MASK);
1571 #endif
1572  }
1573  __syncthreads();
1574  }
1575 }
1576 
1577 void MigrationCUDAKernel::performLocalMigration(
1578  const int numPatches,
1579  CudaLocalRecord* records,
1580  const FullAtom* atomdata_AoS_in,
1581  FullAtom* atomdata_AoS_out,
1582  int4* migrationDestination,
1583  cudaStream_t stream
1584 ) {
1585  constexpr int numThreads = kSortNumThreads;
1586  const int numBlocks = numPatches;
1587 
1588  performLocalMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1589  numPatches,
1590  records,
1591  atomdata_AoS_in,
1592  atomdata_AoS_out,
1593  migrationDestination
1594  );
1595 }
1596 
1597 
1598 /**
1599  * \brief Moves the migrating atoms into their new patches
1600  *
1601  * \par
 1602  * Copies the atoms which have moved into new patches
 1603  * into those patches' scratch buffers. This uses atomic operations to
 1604  * determine the new index of each migration group. This happens at the migration
 1605  * group level so atoms within a migration group stay in the same order.
 1606  *
 1607  * Note because not many atoms move, the use of atomics isn't too expensive.
 1608  * Note because AoS atomic data is ~128 bytes, each warp moves 1 atom.
1609  *
1610  */
1611 __global__ void performMigrationKernel(
1612  const int numPatches,
1613  CudaLocalRecord* localRecords,
1614  CudaLocalRecord** peer_records,
1615  const FullAtom* local_atomdata_AoS,
1616  FullAtom** peer_atomdata_AoS,
1617  const int* migrationGroupSize,
1618  const int* migrationGroupIndex,
1619  int4* migrationDestination
1620 ) {
1621  __shared__ CudaLocalRecord s_record;
1622  using AccessType = int32_t;
1623  AccessType* s_record_buffer = (AccessType*) &s_record;
1624 
1625  const int warps_per_threadblock = blockDim.x / WARPSIZE;
1626  const int wid = threadIdx.x / WARPSIZE;
1627  const int tid = threadIdx.x % WARPSIZE;
1628 
1629  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1630  // Read in the CudaLocalRecord using multiple threads
1631  #pragma unroll 1
1632  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1633  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1634  }
1635  __syncthreads();
1636 
1637  const int numMigrationGroups = s_record.numMigrationGroups;
1638  const int offset = s_record.bufferOffset;
1639  const int patchID = s_record.patchID;
1640 
1641  __syncthreads();
1642 
1643  for (int i = wid; i < numMigrationGroups; i += warps_per_threadblock) {
1644  const int startingIndex = migrationGroupIndex[offset + i];
1645 
1646  const int size = migrationGroupSize[offset + startingIndex];
1647 
1648  const int4 migrationInfo = migrationDestination[offset + startingIndex];
1649  const int dst_patchID = migrationInfo.x;
1650  const int dst_device = migrationInfo.y;
1651  const int dst_localID = migrationInfo.z;
1652  const int dst_offset = dst_localID * MigrationCUDAKernel::kMaxAtomsPerPatch;
1653 
1654  if (dst_patchID == patchID) continue;
1655 
1656  // Get index via atomic operation
1657  int dst_atom_idx;
1658  if (tid == 0) {
1659  int* counter_index = &(peer_records[dst_device][dst_localID].numAtomsNew);
1660 #if __CUDA_ARCH__ >= 600 || defined(NAMD_HIP)
1661  dst_atom_idx = atomicAdd_system(counter_index, size);
1662 #else
1663  // support single-GPU Maxwell
1664  dst_atom_idx = atomicAdd(counter_index, size);
1665 #endif
1666  }
1667  dst_atom_idx = WARP_SHUFFLE(WARP_FULL_MASK, dst_atom_idx, 0, WARPSIZE);
1668 
1669 
1670  FullAtom* remote_atomdata_AoS = peer_atomdata_AoS[dst_device];
1671  for (int j = 0; j < size; j++) {
1672  if (tid * 4 < sizeof(FullAtom)) { // Not all threads load in data
1673  int32_t *src_int = (int32_t*) &(local_atomdata_AoS [offset + startingIndex + j]);
1674  int32_t *dst_int = (int32_t*) &(remote_atomdata_AoS [dst_offset + dst_atom_idx + j]);
1675  dst_int[tid] = src_int[tid];
1676  }
1677  if (tid == 0) {
1678  migrationDestination[offset + startingIndex + j].w = dst_atom_idx + j;
1679  }
1680 #if defined(NAMD_HIP)
1681  NAMD_WARP_SYNC(WARP_FULL_MASK);
1682 #else
1683  WARP_SYNC(WARP_FULL_MASK);
1684 #endif
1685  }
1686  }
1687 
1688  __syncthreads();
1689  }
1690 }
1691 
1692 void MigrationCUDAKernel::performMigration(
1693  const int numPatches,
1694  CudaLocalRecord* records,
1695  CudaLocalRecord** peer_records,
1696  const FullAtom* local_atomdata_AoS,
1697  FullAtom** peer_atomdata_AoS,
1698  const int* migrationGroupSize,
1699  const int* migrationGroupIndex,
1700  int4* migrationDestination,
1701  cudaStream_t stream
1702 ) {
1703  constexpr int numThreads = kSortNumThreads;
1704  const int numBlocks = numPatches;
1705 
1706  performMigrationKernel<<<numBlocks, numThreads, 0, stream>>>(
1707  numPatches,
1708  records,
1709  peer_records,
1710  local_atomdata_AoS,
1711  peer_atomdata_AoS,
1712  migrationGroupSize,
1713  migrationGroupIndex,
1714  migrationDestination
1715  );
1716 }
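
// The kernel above combines three warp-level idioms: lane 0 reserves a block
// of destination slots with a single atomic add, the reserved index is
// broadcast to the rest of the warp, and the ~128-byte AoS record is copied
// one 32-bit word per lane. A standalone sketch with plain CUDA intrinsics and
// a stand-in record type (names are hypothetical, not NAMD's; it assumes
// blockDim.x is a multiple of 32):
struct RecordSketch { int payload[32]; };          // 128 bytes, like FullAtom's AoS entry

__global__ void warpAppendSketch(const RecordSketch* src, int nRecords,
                                 RecordSketch* dst, int* dstCount) {
  const int lane = threadIdx.x % 32;
  const int warp = (threadIdx.x + blockIdx.x * blockDim.x) / 32;
  const int warpsTotal = (gridDim.x * blockDim.x) / 32;

  for (int r = warp; r < nRecords; r += warpsTotal) {
    int slot;
    if (lane == 0) slot = atomicAdd(dstCount, 1);  // reserve one output slot per record
    slot = __shfl_sync(0xffffffffu, slot, 0);      // broadcast the slot to the whole warp

    if (lane * 4 < (int) sizeof(RecordSketch)) {   // each lane moves one 32-bit word
      const int32_t* s = (const int32_t*) &src[r];
      int32_t* d = (int32_t*) &dst[slot];
      d[lane] = s[lane];
    }
    __syncwarp();
  }
}
// performMigrationKernel does the same thing across devices: the counter lives
// in the destination patch's CudaLocalRecord, so it uses atomicAdd_system, and
// one reservation covers a whole migration group (size atoms) at once.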
1717 
1718 
1719 /**
1720  * \brief Updates migrationDestination array based on solute-solvent order
1721  *
1722  * \par
1723  * The migrationDestination structure is used later by the tuple migration, so
1724  * we need to keep it accurate after the solute-solvent sorting. This uses peer
1725  * access to update each atom's new index within its new patch.
1726  *
1727  */
1728 __global__ void updateMigrationDestinationKernel(
1729  const int numAtomsHome,
1730  int4* migrationDestination,
1731  int** d_peer_sortSoluteIndex
1732 ) {
1733  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1734  for (int i = tid; i < numAtomsHome; i += blockDim.x * gridDim.x) {
1735  const int4 dest = migrationDestination[i];
1736  const int dst_device = dest.y;
1737  const int dst_patchID = dest.z;
1738  const int dst_patchOffset = MigrationCUDAKernel::kMaxAtomsPerPatch * dst_patchID;
1739  const int inputIndex = dest.w;
1740 
1741  const int outputIndex = d_peer_sortSoluteIndex[dst_device][dst_patchOffset + inputIndex];
1742  migrationDestination[i].w = outputIndex;
1743  }
1744 }
1745 
1746 void MigrationCUDAKernel::updateMigrationDestination(
1747  const int numAtomsHome,
1748  int4* migrationDestination,
1749  int** d_peer_sortSoluteIndex,
1750  cudaStream_t stream
1751 ) {
1752  constexpr int numThreads = kSortNumThreads;
1753  const int numBlocks = (numAtomsHome + kSortNumThreads - 1) / kSortNumThreads;
1754 
1755  updateMigrationDestinationKernel<<<numBlocks, numThreads, 0, stream>>>(
1756  numAtomsHome,
1757  migrationDestination,
1758  d_peer_sortSoluteIndex
1759  );
1760 }
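
// As used by the kernels in this file, each int4 in migrationDestination packs:
//   .x  destination patch ID,   .y  destination device index,
//   .z  destination patch's local index on that device,
//   .w  the atom's slot inside the destination patch's scratch buffer
//       (remapped by the kernel above after the solute-solvent sort).
// A small illustrative helper (not part of NAMD) that recovers the scratch
// buffer position a destination entry refers to:
__device__ __forceinline__
int destinationScratchSlot(const int4 dest) {
  const int dst_localID = dest.z;    // local patch index on the destination device
  const int slotInPatch = dest.w;    // atom slot within that patch
  return dst_localID * MigrationCUDAKernel::kMaxAtomsPerPatch + slotInPatch;
}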
1761 
1762 
1763 /**
1764  * \brief Copies static atomic data to proxy patches
1765  *
1766  * \par
1767  * Copies per-atom data that does not change during migration to the proxy patches.
1768  *
1769  */
1770 template <bool doAlch>
1771 __global__ void copyDataToProxiesKernel(
1772  const int deviceIndex,
1773  const int numPatchesHome,
1774  const int numPatchesHomeAndProxy,
1775  const CudaLocalRecord* localRecords,
1776  int** peer_id,
1777  int** peer_vdwType,
1778  int** peer_sortOrder,
1779  int** peer_unsortOrder,
1780  float** peer_charge,
1781  int** peer_partition,
1782  double3** peer_patchCenter
1783 ) {
1784  __shared__ CudaLocalRecord s_record;
1785  using AccessType = int32_t;
1786  AccessType* s_record_buffer = (AccessType*) &s_record;
1787 
1788  for (int patchIndex = blockIdx.x + numPatchesHome; patchIndex < numPatchesHomeAndProxy; patchIndex += gridDim.x) {
1789  // Read in the CudaLocalRecord using multiple threads
1790  #pragma unroll 1
1791  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1792  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1793  }
1794  __syncthreads();
1795 
1796  const int numAtoms = s_record.numAtoms;
1797  const int dstOffset = s_record.bufferOffset;
1798  const int srcDevice = s_record.inline_peers[0].deviceIndex;
1799  const int srcOffset = s_record.inline_peers[0].bufferOffset;
1800  const int srcPatchIndex = s_record.inline_peers[0].patchIndex;
1801 
1802  // TODO this data is probably on the host somewhere...
1803  // And it probably doesn't change????
1804  if (threadIdx.x == 0) {
1805  peer_patchCenter[deviceIndex][patchIndex] = peer_patchCenter[srcDevice][srcPatchIndex];
1806  }
1807  __syncthreads();
1808 
1809  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1810  peer_id [deviceIndex][dstOffset + i] = peer_id[srcDevice][srcOffset + i];
1811  peer_vdwType [deviceIndex][dstOffset + i] = peer_vdwType[srcDevice][srcOffset + i];
1812  peer_sortOrder [deviceIndex][dstOffset + i] = peer_sortOrder[srcDevice][srcOffset + i];
1813  peer_unsortOrder[deviceIndex][dstOffset + i] = peer_unsortOrder[srcDevice][srcOffset + i];
1814  peer_charge [deviceIndex][dstOffset + i] = peer_charge[srcDevice][srcOffset + i];
1815  if (doAlch) {
1816  peer_partition[deviceIndex][dstOffset + i] = peer_partition[srcDevice][srcOffset + i];
1817  }
1818  }
1819  __syncthreads();
1820  }
1821 }
1822 
1823 void MigrationCUDAKernel::copyDataToProxies(
1824  const int deviceIndex,
1825  const int numPatchesHome,
1826  const int numPatchesHomeAndProxy,
1827  const CudaLocalRecord* records,
1828  int** peer_id,
1829  int** peer_vdwType,
1830  int** peer_sortOrder,
1831  int** peer_unsortOrder,
1832  float** peer_charge,
1833  int** peer_partition,
1834  double3** peer_patchCenter,
1835  bool doAlch,
1836  cudaStream_t stream
1837 ) {
1838  constexpr int numThreads = kSortNumThreads;
1839  const int numPatches = numPatchesHomeAndProxy - numPatchesHome;
1840  const int numBlocks = numPatches;
1841 
1842  if (doAlch) {
1843  copyDataToProxiesKernel<true><<<numBlocks, numThreads, 0, stream>>>(
1844  deviceIndex,
1845  numPatchesHome, numPatchesHomeAndProxy,
1846  records,
1847  peer_id, peer_vdwType, peer_sortOrder,
1848  peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1849  );
1850  }
1851  else {
1852  copyDataToProxiesKernel<false><<<numBlocks, numThreads, 0, stream>>>(
1853  deviceIndex,
1854  numPatchesHome, numPatchesHomeAndProxy,
1855  records,
1856  peer_id, peer_vdwType, peer_sortOrder,
1857  peer_unsortOrder, peer_charge, peer_partition, peer_patchCenter
1858  );
1859  }
1860 }
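
// copyDataToProxies dispatches on doAlch at launch time so the per-atom
// partition copy is compiled in or out rather than branched on per atom.
// A minimal standalone sketch of that template-on-a-bool dispatch (names and
// fields here are illustrative, not NAMD's):
template <bool doExtra>
__global__ void copyFieldsSketch(const float* src, float* dst,
                                 const int* srcExtra, int* dstExtra, int n) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {
    dst[i] = src[i];
    if (doExtra) dstExtra[i] = srcExtra[i];   // eliminated in the <false> instantiation
  }
}

inline void copyFieldsSketchLauncher(const float* src, float* dst,
                                     const int* srcExtra, int* dstExtra,
                                     int n, bool doExtra, cudaStream_t stream) {
  const int numThreads = 128;
  const int numBlocks = (n + numThreads - 1) / numThreads;
  if (doExtra) {
    copyFieldsSketch<true><<<numBlocks, numThreads, 0, stream>>>(src, dst, srcExtra, dstExtra, n);
  } else {
    copyFieldsSketch<false><<<numBlocks, numThreads, 0, stream>>>(src, dst, srcExtra, dstExtra, n);
  }
}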
1861 
1862 /**
1863  * \brief Copies migrationDestination to proxy patches
1864  *
1865  * \par
1866  * This copies the migrationDestination to proxies to be used in tuple migration.
1867  * It needs to use the old bufferOffsets and atom count, so it is separate from the
1868  * other copyDataToProxies kernel. Additionally, we do this as a put operation to
1869  * avoid a node barrier. The home patch produces this data and then needs to write it
1870  * to the proxy patch; if the device which owns the proxy patch were doing a get operation,
1871  * we'd need a node barrier to make sure this data was ready to be read. By using
1872  * a put operation, we can use a device-local synchronization (i.e. a new kernel launch) to
1873  * make sure the data is ready.
1874  *
1875  */
1876 __global__ void copyMigrationDestinationToProxiesKernel(
1877  const int deviceIndex,
1878  const int numPatchesHome,
1879  const int numPatchesHomeAndProxy,
1880  const CudaLocalRecord* localRecords,
1881  const CudaPeerRecord* peerRecords,
1882  int4** peer_migrationDestination
1883 ) {
1884  __shared__ CudaLocalRecord s_record;
1885  using AccessType = int32_t;
1886  AccessType* s_record_buffer = (AccessType*) &s_record;
1887 
1888  for (int patchIndex = blockIdx.x; patchIndex < numPatchesHome; patchIndex += gridDim.x) {
1889  // Read in the CudaLocalRecord using multiple threads
1890  #pragma unroll 1
1891  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1892  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1893  }
1894  __syncthreads();
1895 
1896  const int numAtoms = s_record.numAtomsNew;
1897  const int dstOffset = s_record.bufferOffsetOld;
1898  const int numPeerRecords = s_record.numPeerRecord;
1899  const int peerRecordStartIndex = s_record.peerRecordStartIndex;
1900 
1901  const int inlineRemotes = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
1902 
1903  for (int remote = 0; remote < inlineRemotes; remote++) {
1904  const int srcDevice = s_record.inline_peers[remote].deviceIndex;
1905  const int srcOffset = s_record.inline_peers[remote].bufferOffset;
1906  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1907  peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1908  }
1909  }
1910 
1911  for (int remote = inlineRemotes; remote < numPeerRecords; remote++) {
1912  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1913  const int srcOffset = peerRecords[peerRecordStartIndex + remote].bufferOffset;
1914  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
1915  peer_migrationDestination[srcDevice][srcOffset + i] = peer_migrationDestination[deviceIndex][dstOffset + i];
1916  }
1917  }
1918  __syncthreads();
1919  }
1920 }
1921 
1922 void MigrationCUDAKernel::copyMigrationDestinationToProxies(
1923  const int deviceIndex,
1924  const int numPatchesHome,
1925  const int numPatchesHomeAndProxy,
1926  const CudaLocalRecord* records,
1927  const CudaPeerRecord* peerRecords,
1928  int4** peer_migrationDestination,
1929  cudaStream_t stream
1930 ) {
1931  constexpr int numThreads = kSortNumThreads;
1932  const int numBlocks = numPatchesHome;
1933 
1934  copyMigrationDestinationToProxiesKernel<<<numBlocks, numThreads, 0, stream>>>(
1935  deviceIndex,
1936  numPatchesHome, numPatchesHomeAndProxy,
1937  records,
1938  peerRecords,
1939  peer_migrationDestination
1940  );
1941 }
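
// A standalone sketch of the put-over-get choice described above, assuming two
// GPUs with peer access available (device identifiers and sizes are
// illustrative). The producing device writes directly into the consumer's
// buffer; the consumer then only needs stream-ordered work on its own device
// before reading, instead of a node-wide barrier to know the data is ready.
__global__ void producePutSketch(int* peerBuffer, int n) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) peerBuffer[i] = i;                 // peer write into the other device's memory
}

inline void putSketch(int n) {
  int canAccess = 0;
  cudaDeviceCanAccessPeer(&canAccess, 0, 1);    // can device 0 reach device 1's memory?
  if (!canAccess) return;                       // sketch only; no fallback path

  int* bufOnDev1 = NULL;
  cudaSetDevice(1);
  cudaMalloc(&bufOnDev1, n * sizeof(int));      // buffer lives on the consuming device

  cudaSetDevice(0);
  cudaDeviceEnablePeerAccess(1, 0);             // device 0 may now dereference bufOnDev1
  producePutSketch<<<(n + 127) / 128, 128>>>(bufOnDev1, n);
  cudaDeviceSynchronize();                      // kernels launched on device 1 after this
                                                // point see the pushed data
}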
1942 
1943 /**
1944  * \brief Updates the CudaLocalRecord data structure
1945  *
1946  * \par
1947  * This updates the number of atoms and buffer offsets of the patches. This should only
1948  * be run on the home patches since it uses peer access to update remote structures.
1949  *
1950  */
1951 __global__ void updateLocalRecordsKernel(
1952  const int numPatchesHome,
1953  CudaLocalRecord* localRecords,
1954  CudaLocalRecord** peer_records,
1955  const CudaPeerRecord* peerRecords
1956 ) {
1957  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
1958  for (int i = tid; i < numPatchesHome; i += blockDim.x * gridDim.x) {
1959  const int numAtomsOld = localRecords[i].numAtoms;
1960  const int numAtoms = localRecords[i].numAtomsNew;
1961  const int numAtomsNBPad = CudaComputeNonbondedKernel::computeAtomPad(numAtoms);
1962  localRecords[i].numAtoms = numAtoms;
1963  localRecords[i].numAtomsNBPad = numAtomsNBPad;
1964  localRecords[i].bufferOffsetOld = localRecords[i].bufferOffset;
1965  localRecords[i].numAtomsNew = numAtomsOld;
1966 
1967  const int numPeerRecord = localRecords[i].numPeerRecord;
1968  const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
1969  // TODO use inline remotes??
1970 
1971  for (int remote = 0; remote < numPeerRecord; remote++) {
1972  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
1973  const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
1974  peer_records[srcDevice][srcOffset].numAtoms = numAtoms;
1975  peer_records[srcDevice][srcOffset].numAtomsNBPad = numAtomsNBPad;
1976  }
1977  }
1978 }
1979 
1980 void MigrationCUDAKernel::updateLocalRecords(
1981  const int numPatchesHome,
1982  CudaLocalRecord* records,
1983  CudaLocalRecord** peer_records,
1984  const CudaPeerRecord* peerRecords,
1985  cudaStream_t stream
1986 ) {
1987  const int numThreads = kSortNumThreads;
1988  const int numBlocks = (numPatchesHome + kSortNumThreads - 1) / kSortNumThreads;
1989 
1990  updateLocalRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
1991  numPatchesHome,
1992  records,
1993  peer_records,
1994  peerRecords
1995  );
1996 
1997 }
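
// The kernel above is an instance of a push-style mirror update: each home
// record refreshes its own fields and then writes the new values into every
// remote copy through the peer-pointer table, so proxy devices never have to
// pull. A stripped-down standalone sketch of that access pattern (types and
// names are illustrative, not NAMD's):
struct MirrorSketch { int value; };

__global__ void pushToMirrorsSketch(int numHome, const MirrorSketch* localMirrors,
                                    MirrorSketch** peerMirrors,
                                    const int2* peerSlots,      // .x = device, .y = slot on that device
                                    const int* peerSlotStart,   // first peer slot of each home entry
                                    const int* peerSlotCount) { // number of peer copies
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < numHome; i += blockDim.x * gridDim.x) {
    const int value = localMirrors[i].value;
    const int start = peerSlotStart[i];
    for (int r = 0; r < peerSlotCount[i]; r++) {
      const int2 slot = peerSlots[start + r];
      peerMirrors[slot.x][slot.y].value = value;  // peer write into the remote copy
    }
  }
}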
1998 
1999 
2000 struct RecordToNumAtoms {
2001  __host__ __device__ __forceinline__
2002  int operator()(const CudaLocalRecord &a) const {
2003  return a.numAtoms;
2004  }
2005 };
2006 
2007 struct RecordToNumAtomsNBPad {
2008  __host__ __device__ __forceinline__
2009  int operator()(const CudaLocalRecord &a) const {
2010  return a.numAtomsNBPad;
2011  }
2012 };
2013 
2014 /**
2015  * \brief Updates the buffer offsets based on scratch patchOffsets
2016  *
2017  */
2018 __global__ void updateLocalRecordsOffsetKernel(
2019  const int numPatchesHomeAndProxy,
2020  CudaLocalRecord* localRecords,
2021  int* patchOffsets,
2022  int* patchOffsetsNB
2023 ) {
2024  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2025  for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2026  localRecords[i].bufferOffset = patchOffsets[i];
2027  localRecords[i].bufferOffsetNBPad = patchOffsetsNB[i];
2028  }
2029 }
2030 
2031 void MigrationCUDAKernel::updateLocalRecordsOffset(
2032  const int numPatchesHomeAndProxy,
2033  CudaLocalRecord* records,
2034  cudaStream_t stream
2035 ) {
2036  const int numThreads = kSortNumThreads;
2037  const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2038  size_t temp = patchDeviceScan_alloc;
2039 
2040  //
2041  // Integration Offsets
2042  //
2043  RecordToNumAtoms conversion_op;
2044  using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2045  InputIter iter(records, conversion_op);
2046 
2047  cub::DeviceScan::ExclusiveSum<InputIter>(
2048  d_patchDeviceScan_scratch, temp,
2049  iter, d_patchOffset_temp, numPatchesHomeAndProxy, stream
2050  );
2051 
2052  //
2053  // Nonbonded Offsets
2054  //
2055  RecordToNumAtomsNBPad conversion_op_nb;
2056  using InputIterNB = cub::TransformInputIterator<int, RecordToNumAtomsNBPad, CudaLocalRecord*>;
2057  InputIterNB iterNB(records, conversion_op_nb);
2058 
2059  cub::DeviceScan::ExclusiveSum<InputIterNB>(
2060  d_patchDeviceScan_scratch, temp,
2061  iterNB, d_patchOffsetNB_temp, numPatchesHomeAndProxy, stream
2062  );
2063 
2064  //
2065  // Update AoS data
2066  //
2067  updateLocalRecordsOffsetKernel<<<numBlocks, numThreads, 0, stream>>>(
2068  numPatchesHomeAndProxy,
2069  records,
2070  d_patchOffset_temp,
2071  d_patchOffsetNB_temp
2072  );
2073 }
2074 
2075 /**
2076  * \brief Updates the remote records on this device
2077  *
2078  */
2079 __global__ void updatePeerRecordsKernel(
2080  const int numPatchesHomeAndProxy,
2081  CudaLocalRecord* localRecords,
2082  CudaLocalRecord** peer_records,
2083  CudaPeerRecord* peerRecords
2084 ) {
2085  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
2086  for (int i = tid; i < numPatchesHomeAndProxy; i += blockDim.x * gridDim.x) {
2087  const int numPeerRecord = localRecords[i].numPeerRecord;
2088  const int peerRecordStartIndex = localRecords[i].peerRecordStartIndex;
2089 
2090  for (int remote = 0; remote < numPeerRecord; remote++) {
2091  const int srcDevice = peerRecords[peerRecordStartIndex + remote].deviceIndex;
2092  const int srcOffset = peerRecords[peerRecordStartIndex + remote].patchIndex;
2093 
2094  const int bufferOffset = peer_records[srcDevice][srcOffset].bufferOffset;
2095  const int bufferOffsetNBPad = peer_records[srcDevice][srcOffset].bufferOffsetNBPad;
2096 
2097  peerRecords[peerRecordStartIndex + remote].bufferOffset = bufferOffset;
2098  peerRecords[peerRecordStartIndex + remote].bufferOffsetNBPad = bufferOffsetNBPad;
2099 
2100  if (remote < CudaLocalRecord::num_inline_peer) {
2101  localRecords[i].inline_peers[remote].bufferOffset = bufferOffset;
2102  localRecords[i].inline_peers[remote].bufferOffsetNBPad = bufferOffsetNBPad;
2103  }
2104  }
2105  }
2106 }
2107 
2108 void MigrationCUDAKernel::updatePeerRecords(
2109  const int numPatchesHomeAndProxy,
2110  CudaLocalRecord* records,
2111  CudaLocalRecord** peer_records,
2112  CudaPeerRecord* peerRecords,
2113  cudaStream_t stream
2114 ) {
2115  const int numThreads = kSortNumThreads;
2116  const int numBlocks = (numPatchesHomeAndProxy + kSortNumThreads - 1) / kSortNumThreads;
2117 
2118  updatePeerRecordsKernel<<<numBlocks, numThreads, 0, stream>>>(
2119  numPatchesHomeAndProxy,
2120  records,
2121  peer_records,
2122  peerRecords
2123  );
2124 }
2125 
2126 MigrationCUDAKernel::MigrationCUDAKernel() {
2127  d_patchOffset_temp = NULL;
2128  d_patchOffsetNB_temp = NULL;
2129 
2130  patchDeviceScan_alloc = 0;
2131  d_patchDeviceScan_scratch = NULL;
2132 }
2133 
2134 MigrationCUDAKernel::~MigrationCUDAKernel() {
2135  if (d_patchOffset_temp != NULL) deallocate_device<int>(&d_patchOffset_temp);
2136  if (d_patchOffsetNB_temp != NULL) deallocate_device<int>(&d_patchOffsetNB_temp);
2137  if (d_patchDeviceScan_scratch != NULL) deallocate_device<char>(&d_patchDeviceScan_scratch);
2138 }
2139 
2140 void MigrationCUDAKernel::allocateScratch(const int numPatchesHomeAndProxy) {
2141  allocate_device<int>(&d_patchOffset_temp, numPatchesHomeAndProxy);
2142  allocate_device<int>(&d_patchOffsetNB_temp, numPatchesHomeAndProxy);
2143 
2144  // Fake call to CUB to get required size
2145  RecordToNumAtoms conversion_op;
2146  using InputIter = cub::TransformInputIterator<int, RecordToNumAtoms, CudaLocalRecord*>;
2147  InputIter iter(NULL, conversion_op);
2148 
2149  void *d_temp_storage = NULL;
2150  size_t temp_storage_bytes = 0;
2151 
2152  cub::DeviceScan::ExclusiveSum<InputIter>(
2153  d_temp_storage, temp_storage_bytes,
2154  iter, d_patchOffset_temp, numPatchesHomeAndProxy
2155  );
2156 
2157  patchDeviceScan_alloc = temp_storage_bytes;
2158  allocate_device<char>(&d_patchDeviceScan_scratch, patchDeviceScan_alloc);
2159 }
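
// allocateScratch and updateLocalRecordsOffset together use the standard
// two-pass CUB idiom: call DeviceScan::ExclusiveSum once with a NULL
// temp-storage pointer to learn the scratch size, allocate that scratch once,
// then reuse it for the real scans. A standalone sketch of the same pattern
// with an illustrative struct (not NAMD's types); it assumes the cub/hipcub
// header already included at the top of this file:
struct ItemSketch { int count; };

struct ItemSketchToCount {
  __host__ __device__ __forceinline__
  int operator()(const ItemSketch &a) const { return a.count; }
};

inline void exclusiveOffsetsSketch(const ItemSketch* d_items, int* d_offsets,
                                   int numItems, cudaStream_t stream) {
  ItemSketchToCount op;
  cub::TransformInputIterator<int, ItemSketchToCount, const ItemSketch*> iter(d_items, op);

  // Pass 1: NULL temp storage; CUB only reports the bytes it needs.
  void* d_temp = NULL;
  size_t tempBytes = 0;
  cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, iter, d_offsets, numItems, stream);

  // Allocate once (NAMD caches this in d_patchDeviceScan_scratch instead).
  cudaMalloc(&d_temp, tempBytes);

  // Pass 2: the actual scan; d_offsets[i] = sum of counts of items 0..i-1.
  cub::DeviceScan::ExclusiveSum(d_temp, tempBytes, iter, d_offsets, numItems, stream);

  cudaFree(d_temp);
}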
2160 #endif // NODEGROUP_FORCE_REGISTER
2161