NAMD
SequencerCUDAKernel.cu
1 #include "common.h"
2 #ifdef NAMD_CUDA
3 
4 #if __CUDACC_VER_MAJOR__ >= 11
5 #include <cub/cub.cuh>
6 #else
7 #include <namd_cub/cub.cuh>
8 #endif
9 
10 #endif //NAMD_CUDA
11 
12 #ifdef NAMD_HIP //NAMD_HIP
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
15 #define cub hipcub
16 #endif
17 
18 #include "HipDefines.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
21 
22 #ifdef NODEGROUP_FORCE_REGISTER
23 
24 #define OVERALLOC 2.0f
25 
26 void NAMD_die(const char *);
27 
28 
29 __constant__ double c_pairlistTrigger;
30 __constant__ double c_pairlistGrow;
31 __constant__ double c_pairlistShrink;
32 
33 __device__ void reset_atomic_counter(unsigned int *counter) {
34  counter[0] = 0;
35  __threadfence();
36 }
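// A minimal sketch (not from the original file) of the "last block done"
// pattern that reset_atomic_counter supports: one thread per block draws a
// ticket from a global counter, and the block holding the final ticket does
// the post-processing. The helper name below is illustrative only.
__device__ bool isLastBlockDoneSketch(unsigned int* counter, unsigned int numBlocks) {
 __threadfence();                                     // publish this block's prior writes
 unsigned int ticket = atomicInc(counter, numBlocks); // previous counter value
 // The block that drew the last ticket is expected to do the cleanup and then
 // call reset_atomic_counter(counter) so the counter is ready for the next launch.
 return (ticket == numBlocks - 1);
}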
37 
38 
39 template <int T_MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
40 __global__ void accumulateForceToSOAKernelMGPU(
41  const int numPatches,
42  CudaLocalRecord* localRecords,
43  const double * __restrict f_bond,
44  const double * __restrict f_bond_nbond,
45  const double * __restrict f_bond_slow,
46  int forceStride,
47  const float4 * __restrict f_nbond,
48  const float4 * __restrict f_nbond_slow,
49  const CudaForce* __restrict f_slow,
50  double * __restrict d_f_global_x,
51  double * __restrict d_f_global_y,
52  double * __restrict d_f_global_z,
53  double * __restrict d_f_normal_x,
54  double * __restrict d_f_normal_y,
55  double * __restrict d_f_normal_z,
56  double * __restrict d_f_nbond_x,
57  double * __restrict d_f_nbond_y,
58  double * __restrict d_f_nbond_z,
59  double * __restrict d_f_slow_x,
60  double * __restrict d_f_slow_y,
61  double * __restrict d_f_slow_z,
62  const int * __restrict patchUnsortOrder,
63  const Lattice lattice,
64  unsigned int** __restrict deviceQueues,
65  unsigned int* __restrict queueCounters,
66  unsigned int* __restrict tbcatomic
67 ) {
68  __shared__ CudaLocalRecord s_record;
69  using AccessType = int32_t;
70  AccessType* s_record_buffer = (AccessType*) &s_record;
71 
72  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
73  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy.
74  #pragma unroll 1
75  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
76  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
77  }
78  __syncthreads();
79 
80  const int numAtoms = s_record.numAtoms;
81  const int offset = s_record.bufferOffset;
82  const int offsetNB = s_record.bufferOffsetNBPad;
83 
84  double f_slow_x, f_slow_y, f_slow_z;
85  double f_nbond_x, f_nbond_y, f_nbond_z;
86  double f_normal_x, f_normal_y, f_normal_z;
87 
88  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
89  if (T_MAX_FORCE_NUMBER >= 1) {
90  int unorder = patchUnsortOrder[offset + i];
91  // gather from sorted nonbonded force array
92  float4 fnbond = f_nbond[offsetNB + unorder];
93  // set (medium) nonbonded force accumulators
94  f_nbond_x = (double)fnbond.x;
95  f_nbond_y = (double)fnbond.y;
96  f_nbond_z = (double)fnbond.z;
97  if (T_MAX_FORCE_NUMBER == 2) {
98  // gather from sorted nonbonded slow force array
99  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
100  // accumulate slow force contributions from nonbonded calculation
101  f_slow_x = (double)fnbslow.x;
102  f_slow_y = (double)fnbslow.y;
103  f_slow_z = (double)fnbslow.z;
104  }
105  }
106  // gather from strided bond force array
107  // set (fast) normal force accumulators
108  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
109  // Global forces are stored in d_f_global; add them to the bonded force
110  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
111  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
112  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
113  if (T_DOCUDAGLOBAL) {
114  d_f_global_x[offset + i] = 0;
115  d_f_global_y[offset + i] = 0;
116  d_f_global_z[offset + i] = 0;
117  }
118  }
119  else{
120  f_normal_x = f_bond[offset + i];
121  f_normal_y = f_bond[offset + i + forceStride];
122  f_normal_z = f_bond[offset + i + 2*forceStride];
123  }
124  if (T_MAX_FORCE_NUMBER >= 1) {
125  // gather from strided bond nonbonded force array
126  // accumulate (medium) nonbonded force accumulators
127  f_nbond_x += f_bond_nbond[offset + i];
128  f_nbond_y += f_bond_nbond[offset + i + forceStride];
129  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
130  if (T_MAX_FORCE_NUMBER == 2) {
131  // gather from strided bond slow force array
132  // accumulate slow force accumulators
133  f_slow_x += f_bond_slow[offset + i];
134  f_slow_y += f_bond_slow[offset + i + forceStride];
135  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
136  }
137  }
138  // set normal, nonbonded, and slow SOA force buffers
139  d_f_normal_x[offset + i] = f_normal_x;
140  d_f_normal_y[offset + i] = f_normal_y;
141  d_f_normal_z[offset + i] = f_normal_z;
142  if (T_MAX_FORCE_NUMBER >= 1) {
143  d_f_nbond_x[offset + i] = f_nbond_x;
144  d_f_nbond_y[offset + i] = f_nbond_y;
145  d_f_nbond_z[offset + i] = f_nbond_z;
146  if (T_MAX_FORCE_NUMBER == 2){
147  d_f_slow_x[offset + i] = f_slow_x;
148  d_f_slow_y[offset + i] = f_slow_y;
149  d_f_slow_z[offset + i] = f_slow_z;
150  }
151  }
152  }
153  __syncthreads();
154  }
155 }
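// The bonded force buffers above (f_bond, f_bond_nbond, f_bond_slow) use a
// strided layout: for buffer index j, the x component lives at f[j], y at
// f[j + forceStride], and z at f[j + 2*forceStride], while the nonbonded
// buffers are float4 arrays addressed through the padded offset plus
// patchUnsortOrder. A minimal sketch of the strided access, assuming only
// what is visible in this kernel (the helper name is illustrative):
__host__ __device__ inline double3 loadStridedForceSketch(
 const double* __restrict__ f, int j, int forceStride) {
 double3 out;
 out.x = f[j];                   // x block
 out.y = f[j + forceStride];     // y block
 out.z = f[j + 2 * forceStride]; // z block
 return out;
}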
156 
157 // DMC This was in the accumulateForceToSOAKernelMGPU (commented out)
158 #if 0
159  __syncthreads();
160  if(threadIdx.x == 0){
161  // Need another value here
162  unsigned int value = atomicInc(&tbcatomic[4], gridDim.x);
163  isLastBlockDone = (value == (gridDim.x - 1));
164  }
165  __syncthreads();
166  if(isLastBlockDone){
167  // thread0 flags everything
168  if(threadIdx.x == 0){
169  for(int i = 0; i < nDev; i++ ){
170  if (i == devID) continue;
171  unsigned int value = atomicInc(& (queueCounters[i]), nDev);
172  printf(" Device[%d] queue[%d] pos[%d]\n", devID, i, value);
173  // deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
174  deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
175  __threadfence_system();
176  }
177  // sets the tbc counter back to zero
178  tbcatomic[4] = 0;
179  printf(" GPU[%d] finished accumulating SOA buffers\n", devID);
180  }
181  }
182 #endif
183 
184 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
185 __global__ void accumulateForceToSOAKernel(
186  const int numPatches,
187  CudaLocalRecord* localRecords,
188  const double * __restrict f_bond,
189  const double * __restrict f_bond_nbond,
190  const double * __restrict f_bond_slow,
191  int forceStride,
192  const float4 * __restrict f_nbond,
193  const float4 * __restrict f_nbond_slow,
194  const CudaForce* __restrict f_slow,
195  double * __restrict d_f_global_x,
196  double * __restrict d_f_global_y,
197  double * __restrict d_f_global_z,
198  double * __restrict d_f_normal_x,
199  double * __restrict d_f_normal_y,
200  double * __restrict d_f_normal_z,
201  double * __restrict d_f_nbond_x,
202  double * __restrict d_f_nbond_y,
203  double * __restrict d_f_nbond_z,
204  double * __restrict d_f_slow_x,
205  double * __restrict d_f_slow_y,
206  double * __restrict d_f_slow_z,
207  const int * __restrict patchUnsortOrder,
208  const Lattice lattice
209 ) {
210  __shared__ CudaLocalRecord s_record;
211  using AccessType = int32_t;
212  AccessType* s_record_buffer = (AccessType*) &s_record;
213 
214  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
215  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy.
216  #pragma unroll 1
217  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
218  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
219  }
220  __syncthreads();
221 
222  const int numAtoms = s_record.numAtoms;
223  const int offset = s_record.bufferOffset;
224  const int offsetNB = s_record.bufferOffsetNBPad;
225 
226  double f_slow_x, f_slow_y, f_slow_z;
227  double f_nbond_x, f_nbond_y, f_nbond_z;
228  double f_normal_x, f_normal_y, f_normal_z;
229 
230  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
231  if (MAX_FORCE_NUMBER == 2) {
232  CudaForce f = f_slow[offset + i];
233  double3 f_scaled = lattice.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
234  // set slow force accumulators
235  f_slow_x = f_scaled.x;
236  f_slow_y = f_scaled.y;
237  f_slow_z = f_scaled.z;
238  }
239  if (MAX_FORCE_NUMBER >= 1) {
240  int unorder = patchUnsortOrder[offset + i];
241  // gather from sorted nonbonded force array
242  float4 fnbond = f_nbond[offsetNB + unorder];
243  // set (medium) nonbonded force accumulators
244  f_nbond_x = (double)fnbond.x;
245  f_nbond_y = (double)fnbond.y;
246  f_nbond_z = (double)fnbond.z;
247  if (MAX_FORCE_NUMBER == 2) {
248  // gather from sorted nonbonded slow force array
249  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
250  // accumulate slow force contributions from nonbonded calculation
251  f_slow_x += (double)fnbslow.x;
252  f_slow_y += (double)fnbslow.y;
253  f_slow_z += (double)fnbslow.z;
254  }
255  }
256  // gather from strided bond force array
257  // set (fast) normal force accumulators
258  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
259  // Global forces are stored in d_f_global; add them to the bonded force
260  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
261  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
262  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
263  if (T_DOCUDAGLOBAL) {
264  d_f_global_x[offset + i] = 0;
265  d_f_global_y[offset + i] = 0;
266  d_f_global_z[offset + i] = 0;
267  }
268  }
269  else
270  {
271  f_normal_x = f_bond[offset + i];
272  f_normal_y = f_bond[offset + i + forceStride];
273  f_normal_z = f_bond[offset + i + 2*forceStride];
274  }
275 
276  if (MAX_FORCE_NUMBER >= 1) {
277  // gather from strided bond nonbonded force array
278  // accumulate (medium) nonbonded force accumulators
279  f_nbond_x += f_bond_nbond[offset + i];
280  f_nbond_y += f_bond_nbond[offset + i + forceStride];
281  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
282  if (MAX_FORCE_NUMBER == 2) {
283  // gather from strided bond slow force array
284  // accumulate slow force accumulators
285  f_slow_x += f_bond_slow[offset + i];
286  f_slow_y += f_bond_slow[offset + i + forceStride];
287  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
288  }
289  }
290  // set normal, nonbonded, and slow SOA force buffers
291  d_f_normal_x[offset + i] = f_normal_x;
292  d_f_normal_y[offset + i] = f_normal_y;
293  d_f_normal_z[offset + i] = f_normal_z;
294  if (MAX_FORCE_NUMBER >= 1) {
295  d_f_nbond_x[offset + i] = f_nbond_x;
296  d_f_nbond_y[offset + i] = f_nbond_y;
297  d_f_nbond_z[offset + i] = f_nbond_z;
298  if (MAX_FORCE_NUMBER == 2) {
299  d_f_slow_x[offset + i] = f_slow_x;
300  d_f_slow_y[offset + i] = f_slow_y;
301  d_f_slow_z[offset + i] = f_slow_z;
302  }
303  }
304  }
305  __syncthreads();
306  }
307 }
308 
309 __global__ void accumulatePMEForces(
310  const int numAtoms,
311  const CudaForce* f_slow,
312  double* d_f_slow_x,
313  double* d_f_slow_y,
314  double* d_f_slow_z,
315  const int* patchOffsets,
316  const Lattice lat
317 ){
318  int tid = threadIdx.x + (blockDim.x * blockIdx.x);
319  double f_slow_x, f_slow_y, f_slow_z;
320 
321  if(tid < numAtoms){
322  CudaForce f = f_slow[tid];
323 
324  double3 f_scaled = lat.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
325  f_slow_x = f_scaled.x;
326  f_slow_y = f_scaled.y;
327  f_slow_z = f_scaled.z;
328 
329  d_f_slow_x[tid] += f_slow_x;
330  d_f_slow_y[tid] += f_slow_y;
331  d_f_slow_z[tid] += f_slow_z;
332  }
333 }
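// A minimal host-side launch sketch for the atom-based accumulatePMEForces
// kernel above; the block size and helper name here are illustrative, not
// taken from this file.
static void launchAccumulatePMEForcesSketch(
 const int numAtoms, const CudaForce* f_slow,
 double* d_f_slow_x, double* d_f_slow_y, double* d_f_slow_z,
 const int* patchOffsets, const Lattice lat, cudaStream_t stream) {
 const int blockSize = 128;                               // assumed thread count per block
 const int grid = (numAtoms + blockSize - 1) / blockSize; // ceiling division over atoms
 accumulatePMEForces<<<grid, blockSize, 0, stream>>>(
  numAtoms, f_slow, d_f_slow_x, d_f_slow_y, d_f_slow_z, patchOffsets, lat);
}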
334 
335 void SequencerCUDAKernel::accumulateForceToSOA(
336  const int doGlobal,
337  const int doCudaGlobal,
338  const int maxForceNumber,
339  const int numPatches,
340  const int nDevices,
341  CudaLocalRecord* localRecords,
342  const double* f_bond,
343  const double* f_bond_nbond,
344  const double* f_bond_slow,
345  int forceStride,
346  const float4* f_nbond,
347  const float4* f_nbond_slow,
348  const CudaForce* f_slow,
349  double* d_f_global_x,
350  double* d_f_global_y,
351  double* d_f_global_z,
352  double* d_f_normal_x,
353  double* d_f_normal_y,
354  double* d_f_normal_z,
355  double* d_f_nbond_x,
356  double* d_f_nbond_y,
357  double* d_f_nbond_z,
358  double* d_f_slow_x,
359  double* d_f_slow_y,
360  double* d_f_slow_z,
361  const int* patchUnsortOrder,
362  const Lattice lattice,
363  unsigned int** deviceQueues,
364  unsigned int* queueCounters,
365  unsigned int* tbcatomic,
366  cudaStream_t stream
367 ) {
368  // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
369  const int options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
370  if (nDevices > 1) {
371 #define CALL_MGPU(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
372  accumulateForceToSOAKernelMGPU<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
373  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
374  ( numPatches, localRecords, \
375  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
376  f_nbond, f_nbond_slow, f_slow, \
377  d_f_global_x, d_f_global_y, d_f_global_z, \
378  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
379  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
380  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
381  patchUnsortOrder, lattice, \
382  deviceQueues, queueCounters, tbcatomic \
383  )
384  switch (options) {
385  case ((0<<2) + (0<<1) + (0<<0)): CALL_MGPU(0, 0, 0); break;
386  case ((0<<2) + (0<<1) + (1<<0)): CALL_MGPU(0, 0, 1); break;
387  case ((0<<2) + (1<<1) + (1<<0)): CALL_MGPU(0, 1, 1); break;
388  case ((0<<2) + (1<<1) + (0<<0)): CALL_MGPU(0, 1, 0); break;
389  case ((1<<2) + (0<<1) + (0<<0)): CALL_MGPU(1, 0, 0); break;
390  case ((1<<2) + (0<<1) + (1<<0)): CALL_MGPU(1, 0, 1); break;
391  case ((1<<2) + (1<<1) + (1<<0)): CALL_MGPU(1, 1, 1); break;
392  case ((1<<2) + (1<<1) + (0<<0)): CALL_MGPU(1, 1, 0); break;
393  case ((2<<2) + (0<<1) + (0<<0)): CALL_MGPU(2, 0, 0); break;
394  case ((2<<2) + (0<<1) + (1<<0)): CALL_MGPU(2, 0, 1); break;
395  case ((2<<2) + (1<<1) + (1<<0)): CALL_MGPU(2, 1, 1); break;
396  case ((2<<2) + (1<<1) + (0<<0)): CALL_MGPU(2, 1, 0); break;
397  default: {
398  const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
399  NAMD_bug(error.c_str());
400  }
401  }
402 #undef CALL_MGPU
403  } else {
404 #define CALL(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
405  accumulateForceToSOAKernel<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
406  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
407  ( numPatches, localRecords, \
408  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
409  f_nbond, f_nbond_slow, f_slow, \
410  d_f_global_x, d_f_global_y, d_f_global_z, \
411  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
412  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
413  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
414  patchUnsortOrder, lattice \
415  )
416  switch (options) {
417  case ((0<<2) + (0<<1) + (0<<0)): CALL(0, 0, 0); break;
418  case ((0<<2) + (0<<1) + (1<<0)): CALL(0, 0, 1); break;
419  case ((0<<2) + (1<<1) + (1<<0)): CALL(0, 1, 1); break;
420  case ((0<<2) + (1<<1) + (0<<0)): CALL(0, 1, 0); break;
421  case ((1<<2) + (0<<1) + (0<<0)): CALL(1, 0, 0); break;
422  case ((1<<2) + (0<<1) + (1<<0)): CALL(1, 0, 1); break;
423  case ((1<<2) + (1<<1) + (1<<0)): CALL(1, 1, 1); break;
424  case ((1<<2) + (1<<1) + (0<<0)): CALL(1, 1, 0); break;
425  case ((2<<2) + (0<<1) + (0<<0)): CALL(2, 0, 0); break;
426  case ((2<<2) + (0<<1) + (1<<0)): CALL(2, 0, 1); break;
427  case ((2<<2) + (1<<1) + (1<<0)): CALL(2, 1, 1); break;
428  case ((2<<2) + (1<<1) + (0<<0)): CALL(2, 1, 0); break;
429  default: {
430  const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
431  NAMD_bug(error.c_str());
432  }
433  }
434 #undef CALL
435  }
436 }
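// The dispatch above packs the runtime flags into a single integer,
// options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal, so that
// one switch statement selects the matching template instantiation. A minimal
// sketch of the encoding (the helper name is illustrative only):
static constexpr int packSOAOptionsSketch(int maxForceNumber, int doGlobal, int doCudaGlobal) {
 return (maxForceNumber << 2) + (doGlobal << 1) + (doCudaGlobal << 0);
}
// For example, packSOAOptionsSketch(2, 1, 0) == 10, which matches the
// case ((2<<2) + (1<<1) + (0<<0)) label above.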
437 
438 template <int T_MAX_FORCE_NUMBER>
439 __global__ void mergeForcesFromPeersKernel(
440  const int numPatches,
441  const int devID, // GPU ID
442  CudaLocalRecord* localRecords,
443  CudaPeerRecord* peerRecords,
444  const Lattice lat,
445  double** __restrict f_normal_x,
446  double** __restrict f_normal_y,
447  double** __restrict f_normal_z,
448  double** __restrict f_nbond_x,
449  double** __restrict f_nbond_y,
450  double** __restrict f_nbond_z,
451  double** __restrict f_slow_x,
452  double** __restrict f_slow_y,
453  double** __restrict f_slow_z,
454  const CudaForce* __restrict pmeForces
455 ) {
456  __shared__ CudaLocalRecord s_record;
457  using AccessType = int32_t;
458  AccessType* s_record_buffer = (AccessType*) &s_record;
459 
460  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
461  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy.
462  #pragma unroll 1
463  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
464  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
465  }
466  __syncthreads();
467 
468  const int numAtoms = s_record.numAtoms;
469  const int dstOffset = s_record.bufferOffset;
470  const int numPeerRecords = s_record.numPeerRecord;
471  const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
472 
473  const int inlinePeers = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
474 
475  for (int peer = 0; peer < inlinePeers; peer++) {
476  const int srcDevice = s_record.inline_peers[peer].deviceIndex;
477  const int srcOffset = s_record.inline_peers[peer].bufferOffset;
478 
479  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
480  double f_x = f_normal_x[srcDevice][srcOffset + i];
481  double f_y = f_normal_y[srcDevice][srcOffset + i];
482  double f_z = f_normal_z[srcDevice][srcOffset + i];
483 
484  f_normal_x[devID][dstOffset + i] += f_x;
485  f_normal_y[devID][dstOffset + i] += f_y;
486  f_normal_z[devID][dstOffset + i] += f_z;
487 
488  if(T_MAX_FORCE_NUMBER >= 1){
489  // We don't need the nonbonded offset here since
490  // this isn't the actual nonbonded buffer
491  f_x = f_nbond_x[srcDevice][srcOffset + i];
492  f_y = f_nbond_y[srcDevice][srcOffset + i];
493  f_z = f_nbond_z[srcDevice][srcOffset + i];
494 
495  f_nbond_x[devID][dstOffset + i] += f_x;
496  f_nbond_y[devID][dstOffset + i] += f_y;
497  f_nbond_z[devID][dstOffset + i] += f_z;
498 
499  if(T_MAX_FORCE_NUMBER == 2){
500  f_x = f_slow_x[srcDevice][srcOffset + i];
501  f_y = f_slow_y[srcDevice][srcOffset + i];
502  f_z = f_slow_z[srcDevice][srcOffset + i];
503 
504  f_slow_x[devID][dstOffset + i] += f_x;
505  f_slow_y[devID][dstOffset + i] += f_y;
506  f_slow_z[devID][dstOffset + i] += f_z;
507  }
508  }
509  }
510  }
511 
512  for (int peer = inlinePeers; peer < numPeerRecords; peer++) {
513  const int srcDevice = peerRecords[peerRecordStartIndexs + peer].deviceIndex;
514  const int srcOffset = peerRecords[peerRecordStartIndexs + peer].bufferOffset;
515 
516  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
517  double f_x = f_normal_x[srcDevice][srcOffset + i];
518  double f_y = f_normal_y[srcDevice][srcOffset + i];
519  double f_z = f_normal_z[srcDevice][srcOffset + i];
520 
521  f_normal_x[devID][dstOffset + i] += f_x;
522  f_normal_y[devID][dstOffset + i] += f_y;
523  f_normal_z[devID][dstOffset + i] += f_z;
524 
525  if(T_MAX_FORCE_NUMBER >= 1){
526  // We don't need the nonbonded offset here since
527  // this isn't the actual nonbonded buffer
528  f_x = f_nbond_x[srcDevice][srcOffset + i];
529  f_y = f_nbond_y[srcDevice][srcOffset + i];
530  f_z = f_nbond_z[srcDevice][srcOffset + i];
531 
532  f_nbond_x[devID][dstOffset + i] += f_x;
533  f_nbond_y[devID][dstOffset + i] += f_y;
534  f_nbond_z[devID][dstOffset + i] += f_z;
535 
536  if(T_MAX_FORCE_NUMBER == 2){
537  f_x = f_slow_x[srcDevice][srcOffset + i];
538  f_y = f_slow_y[srcDevice][srcOffset + i];
539  f_z = f_slow_z[srcDevice][srcOffset + i];
540 
541  f_slow_x[devID][dstOffset + i] += f_x;
542  f_slow_y[devID][dstOffset + i] += f_y;
543  f_slow_z[devID][dstOffset + i] += f_z;
544  }
545  }
546  }
547  }
548 
549  // Merge PME forces here instead of in the force-fetching kernel
550  if(T_MAX_FORCE_NUMBER == 2){
551  for(int i = threadIdx.x; i < numAtoms; i += blockDim.x){
552  CudaForce f = pmeForces[dstOffset + i];
553 
554  double3 f_scaled = lat.scale_force(
555  Vector((double)f.x, (double)f.y, (double)f.z));
556  f_slow_x[devID][dstOffset + i] += f_scaled.x;
557  f_slow_y[devID][dstOffset + i] += f_scaled.y;
558  f_slow_z[devID][dstOffset + i] += f_scaled.z;
559  }
560  }
561  __syncthreads();
562  }
563 }
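// mergeForcesFromPeersKernel dereferences buffers that reside in other
// devices' memory (f_normal_x[srcDevice], etc.), which relies on direct
// peer-to-peer access between the GPUs having been enabled elsewhere. A
// minimal setup sketch, not taken from this file, for one direction between
// two devices:
static void enablePeerAccessSketch(int dev, int peerDev) {
 int canAccess = 0;
 cudaDeviceCanAccessPeer(&canAccess, dev, peerDev); // can dev map peerDev's memory?
 if (canAccess) {
  cudaSetDevice(dev);
  cudaDeviceEnablePeerAccess(peerDev, 0);           // flags must be 0
 }
}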
564 
565 void SequencerCUDAKernel::mergeForcesFromPeers(
566  const int devID,
567  const int maxForceNumber,
568  const Lattice lat,
569  const int numPatchesHomeAndProxy,
570  const int numPatchesHome,
571  double** f_normal_x,
572  double** f_normal_y,
573  double** f_normal_z,
574  double** f_nbond_x,
575  double** f_nbond_y,
576  double** f_nbond_z,
577  double** f_slow_x,
578  double** f_slow_y,
579  double** f_slow_z,
580  const CudaForce* pmeForces,
581  CudaLocalRecord* localRecords,
582  CudaPeerRecord* peerRecords,
583  std::vector<int>& atomCounts,
584  cudaStream_t stream
585 ){
586  // patch-based kernel: one thread block of PATCH_BLOCKS threads per home patch
587  //int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
588  int grid = (numPatchesHome);
589  int blocks = PATCH_BLOCKS;
590 
591 
592  int offset = 0;
593  for (int i = 0; i < devID; i++) {
594  offset += atomCounts[i];
595  }
596 
597  switch (maxForceNumber) {
598  case 0:
599  mergeForcesFromPeersKernel<0><<<grid, blocks, 0, stream>>>(
600  numPatchesHome, devID, localRecords, peerRecords,
601  lat,
602  f_normal_x, f_normal_y, f_normal_z,
603  f_nbond_x, f_nbond_y, f_nbond_z,
604  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
605  );
606  break;
607  case 1:
608  mergeForcesFromPeersKernel<1><<<grid, blocks, 0, stream>>>(
609  numPatchesHome, devID, localRecords, peerRecords,
610  lat,
611  f_normal_x, f_normal_y, f_normal_z,
612  f_nbond_x, f_nbond_y, f_nbond_z,
613  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
614  );
615  break;
616  case 2:
617  mergeForcesFromPeersKernel<2><<<grid, blocks, 0, stream>>>(
618  numPatchesHome, devID, localRecords, peerRecords,
619  lat,
620  f_normal_x, f_normal_y, f_normal_z,
621  f_nbond_x, f_nbond_y, f_nbond_z,
622  f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
623  );
624  break;
625  }
626 }
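// The pmeForces offset computed above is the exclusive prefix sum of
// atomCounts over the devices ranked before devID, i.e. the number of atoms
// owned by lower-indexed devices. A standalone sketch of the same computation
// (the helper name is illustrative only):
static int pmeForceOffsetSketch(const std::vector<int>& atomCounts, int devID) {
 int offset = 0;
 for (int i = 0; i < devID; i++) {
  offset += atomCounts[i]; // atoms owned by devices 0 .. devID-1
 }
 return offset;
}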
627 
628 // JM NOTE: This is a fused version of accumulateForceToSOA + addForceToMomentum.
629  // addForceToMomentum has very little math and is memory-bandwidth bound, so
630  // fusing these kernels lets us keep the forces in registers and
631  // reuse their values for the velocity update
632 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL, int DO_FIXED>
633 __global__ void accumulateForceKick(
634  const int numPatches,
635  CudaLocalRecord* localRecords,
636  const double * __restrict f_bond,
637  const double * __restrict f_bond_nbond,
638  const double * __restrict f_bond_slow,
639  int forceStride,
640  const float4 * __restrict f_nbond,
641  const float4 * __restrict f_nbond_slow,
642  const CudaForce* __restrict f_slow,
643  double * __restrict d_f_global_x,
644  double * __restrict d_f_global_y,
645  double * __restrict d_f_global_z,
646  double * __restrict d_f_normal_x,
647  double * __restrict d_f_normal_y,
648  double * __restrict d_f_normal_z,
649  double * __restrict d_f_nbond_x,
650  double * __restrict d_f_nbond_y,
651  double * __restrict d_f_nbond_z,
652  double * __restrict d_f_slow_x,
653  double * __restrict d_f_slow_y,
654  double * __restrict d_f_slow_z,
655  const int * __restrict patchUnsortOrder,
656  const Lattice lattice,
657  double *d_vel_x,
658  double *d_vel_y,
659  double *d_vel_z,
660  const double * __restrict recipMass,
661  const int* __restrict atomFixed,
662  const double dt_normal,
663  const double dt_nbond,
664  const double dt_slow,
665  const double scaling
666 ) {
667  __shared__ CudaLocalRecord s_record;
668  using AccessType = int32_t;
669  AccessType* s_record_buffer = (AccessType*) &s_record;
670 
671  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
672  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy.
673  #pragma unroll 1
674  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
675  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
676  }
677  __syncthreads();
678 
679  const int numAtoms = s_record.numAtoms;
680  const int offset = s_record.bufferOffset;
681  const int offsetNB = s_record.bufferOffsetNBPad;
682 
683  double f_slow_x, f_slow_y, f_slow_z;
684  double f_nbond_x, f_nbond_y, f_nbond_z;
685  double f_normal_x, f_normal_y, f_normal_z;
686 
687  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
688  f_slow_x = 0.0;
689  f_slow_y = 0.0;
690  f_slow_z = 0.0;
691  f_nbond_x = 0.0;
692  f_nbond_y = 0.0;
693  f_nbond_z = 0.0;
694  f_normal_x = 0.0;
695  f_normal_y = 0.0;
696  f_normal_z = 0.0;
697 
698  if (MAX_FORCE_NUMBER == 2) {
699  CudaForce f = f_slow[offset + i];
700 
701  double3 f_scaled = lattice.scale_force(
702  Vector((double)f.x, (double)f.y, (double)f.z));
703 
704  // set slow force accumulators
705  f_slow_x = f_scaled.x;
706  f_slow_y = f_scaled.y;
707  f_slow_z = f_scaled.z;
708  }
709  if (MAX_FORCE_NUMBER >= 1) {
710  int unorder = patchUnsortOrder[offset + i];
711  // gather from sorted nonbonded force array
712  float4 fnbond = f_nbond[offsetNB + unorder];
713  // set (medium) nonbonded force accumulators
714  f_nbond_x = (double)fnbond.x;
715  f_nbond_y = (double)fnbond.y;
716  f_nbond_z = (double)fnbond.z;
717  if (MAX_FORCE_NUMBER == 2) {
718  // gather from sorted nonbonded slow force array
719  float4 fnbslow = f_nbond_slow[offsetNB + unorder];
720  // accumulate slow force contributions from nonbonded calculation
721  f_slow_x += (double)fnbslow.x;
722  f_slow_y += (double)fnbslow.y;
723  f_slow_z += (double)fnbslow.z;
724  }
725  }
726  // gather from strided bond force array
727  // set (fast) normal force accumulators
728  if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
729  // Global forces are stored in d_f_global; add them to the bonded force
730  f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
731  f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
732  f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
733  if (T_DOCUDAGLOBAL) {
734  d_f_global_x[offset + i] = 0;
735  d_f_global_y[offset + i] = 0;
736  d_f_global_z[offset + i] = 0;
737  }
738  }
739  else
740  {
741 
742  f_normal_x = f_bond[offset + i];
743  f_normal_y = f_bond[offset + i + forceStride];
744  f_normal_z = f_bond[offset + i + 2*forceStride];
745  }
746 
747  if (MAX_FORCE_NUMBER >= 1) {
748  // gather from strided bond nonbonded force array
749  // accumulate (medium) nonbonded force accumulators
750  f_nbond_x += f_bond_nbond[offset + i];
751  f_nbond_y += f_bond_nbond[offset + i + forceStride];
752  f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
753  if (MAX_FORCE_NUMBER == 2) {
754  // gather from strided bond slow force array
755  // accumulate slow force accumulators
756  f_slow_x += f_bond_slow[offset + i];
757  f_slow_y += f_bond_slow[offset + i + forceStride];
758  f_slow_z += f_bond_slow[offset + i + 2*forceStride];
759  }
760  }
761  // set normal, nonbonded, and slow SOA force buffers
762  d_f_normal_x[offset + i] = f_normal_x;
763  d_f_normal_y[offset + i] = f_normal_y;
764  d_f_normal_z[offset + i] = f_normal_z;
765  if (MAX_FORCE_NUMBER >= 1) {
766  d_f_nbond_x[offset + i] = f_nbond_x;
767  d_f_nbond_y[offset + i] = f_nbond_y;
768  d_f_nbond_z[offset + i] = f_nbond_z;
769  if (MAX_FORCE_NUMBER == 2) {
770  d_f_slow_x[offset + i] = f_slow_x;
771  d_f_slow_y[offset + i] = f_slow_y;
772  d_f_slow_z[offset + i] = f_slow_z;
773  }
774  }
775 
776  /* Velocity updates */
777  double rp = recipMass[offset + i];
778  double vx, vy, vz;
779  vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
780  vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
781  vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
782  if (!DO_FIXED) {
783  d_vel_x[offset + i] += vx;
784  d_vel_y[offset + i] += vy;
785  d_vel_z[offset + i] += vz;
786  } else {
787  if (!atomFixed[offset + i]) {
788  d_vel_x[offset + i] += vx;
789  d_vel_y[offset + i] += vy;
790  d_vel_z[offset + i] += vz;
791  } else {
792  d_vel_x[offset + i] = 0;
793  d_vel_y[offset + i] = 0;
794  d_vel_z[offset + i] = 0;
795  }
796  }
797  }
798  __syncthreads();
799  }
800 }
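// The velocity update applied in accumulateForceKick is, per component,
//   v += scaling * (dt_slow*f_slow + dt_nbond*f_nbond + dt_normal*f_normal) / m,
// with 1/m supplied through recipMass and fixed atoms either skipped or
// zeroed when DO_FIXED is set. A scalar reference sketch of the same kick
// (names are illustrative only):
__host__ __device__ inline double kickVelocitySketch(
 double v, double f_normal, double f_nbond, double f_slow,
 double dt_normal, double dt_nbond, double dt_slow,
 double scaling, double recipMass) {
 return v + ((f_slow * dt_slow) + (f_nbond * dt_nbond) + (f_normal * dt_normal))
             * scaling * recipMass;
}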
801 
802 
803 #if 0
804 template <int MAX_FORCE_NUMBER>
805 __global__ void accumulateForceKick(
806  const double * __restrict f_bond,
807  const double * __restrict f_bond_nbond,
808  const double * __restrict f_bond_slow,
809  int forceStride,
810  const PatchRecord * __restrict bond_pr,
811  const float4 * __restrict f_nbond,
812  const float4 * __restrict f_nbond_slow,
813  const CudaPatchRecord * __restrict nbond_pr,
814  const CudaForce * __restrict f_slow,
815  double *d_f_normal_x,
816  double *d_f_normal_y,
817  double *d_f_normal_z,
818  double *d_f_nbond_x,
819  double *d_f_nbond_y,
820  double *d_f_nbond_z,
821  double *d_f_slow_x,
822  double *d_f_slow_y,
823  double *d_f_slow_z,
824  double *d_vel_x,
825  double *d_vel_y,
826  double *d_vel_z,
827  const double * __restrict recipMass,
828  const double dt_normal,
829  const double dt_nbond,
830  const double dt_slow,
831  const double scaling,
832  const int * __restrict patchIDs,
833  const int * __restrict patchOffsets,
834  const int * __restrict patchUnsortOrder,
835  const CudaLattice lat)
836 {
837  int tid = threadIdx.x;
838  int stride = blockDim.x;
839 
840  __shared__ int sh_patchID;
841  __shared__ int sh_patchOffset;
842  // number of atoms per patch should be same no matter
843  // which data structure we access it through
844  __shared__ int sh_natoms;
845  __shared__ int sh_bondForceOffset;
846  __shared__ int sh_nbondForceOffset;
847 
848  // int patchID;
849  int patchOffset;
850  // number of atoms per patch should be same no matter
851  // which data structure we access it through
852  int natoms;
853  int bondForceOffset;
854  int nbondForceOffset;
855 
856  if(threadIdx.x == 0){
857  sh_patchID = patchIDs[blockIdx.x];
858  sh_patchOffset = patchOffsets[blockIdx.x];
859  // number of atoms per patch should be same no matter
860  // which data structure we access it through
861  sh_natoms = bond_pr[sh_patchID].numAtoms;
862  sh_bondForceOffset = bond_pr[sh_patchID].atomStart;
863  if(MAX_FORCE_NUMBER >= 1){
864  sh_nbondForceOffset = nbond_pr[sh_patchID].atomStart;
865  }
866  }
867 
868  __syncthreads();
869 
870  // pull stuff to registers
871  // patchID = sh_patchID;
872  patchOffset = sh_patchOffset;
873  natoms = sh_natoms;
874  bondForceOffset = sh_bondForceOffset;
875  nbondForceOffset = (MAX_FORCE_NUMBER >= 1) ? sh_nbondForceOffset : 0;
876 
877  double r1x, r1y, r1z, r2x, r2y, r2z, r3x, r3y, r3z;
878 
879  if (MAX_FORCE_NUMBER == 2) {
880  r1x = lat.b1.x;
881  r1y = lat.b1.y;
882  r1z = lat.b1.z;
883  r2x = lat.b2.x;
884  r2y = lat.b2.y;
885  r2z = lat.b2.z;
886  r3x = lat.b3.x;
887  r3y = lat.b3.y;
888  r3z = lat.b3.z;
889  }
890 
891  double f_slow_x, f_slow_y, f_slow_z;
892  double f_nbond_x, f_nbond_y, f_nbond_z;
893  double f_normal_x, f_normal_y, f_normal_z;
894  double vx, vy, vz;
895 
896  for (int i = tid; i < natoms; i += stride) {
897  f_slow_x = 0.0;
898  f_slow_y = 0.0;
899  f_slow_z = 0.0;
900  f_nbond_x = 0.0;
901  f_nbond_y = 0.0;
902  f_nbond_z = 0.0;
903  f_normal_x = 0.0;
904  f_normal_y = 0.0;
905  f_normal_z = 0.0;
906 
907  if(T_DOGLOBAL) {
908  // Global forces is stored in d_f_normal, just need to add bonded force
909  f_normal_x = d_f_normal_x[patchOffset + i];
910  f_normal_y = d_f_normal_y[patchOffset + i];
911  f_normal_z = d_f_normal_z[patchOffset + i];
912  }
913 
914  if (MAX_FORCE_NUMBER == 2) {
915  CudaForce f = f_slow[patchOffset + i];
916  double fx = (double) f.x;
917  double fy = (double) f.y;
918  double fz = (double) f.z;
919  // set slow force accumulators
920  f_slow_x = (fx*r1x + fy*r2x + fz*r3x);
921  f_slow_y = (fx*r1y + fy*r2y + fz*r3y);
922  f_slow_z = (fx*r1z + fy*r2z + fz*r3z);
923  }
924  if (MAX_FORCE_NUMBER >= 1) {
925  int unorder = patchUnsortOrder[patchOffset + i];
926  // gather from sorted nonbonded force array
927  float4 fnbond = f_nbond[nbondForceOffset + unorder];
928  // set (medium) nonbonded force accumulators
929  f_nbond_x = (double)fnbond.x;
930  f_nbond_y = (double)fnbond.y;
931  f_nbond_z = (double)fnbond.z;
932  if (MAX_FORCE_NUMBER == 2) {
933  // gather from sorted nonbonded slow force array
934  float4 fnbslow = f_nbond_slow[nbondForceOffset + unorder];
935  // accumulate slow force contributions from nonbonded calculation
936  f_slow_x += (double)fnbslow.x;
937  f_slow_y += (double)fnbslow.y;
938  f_slow_z += (double)fnbslow.z;
939  }
940  }
941  // gather from strided bond force array
942  // set (fast) normal force accumulators
943  f_normal_x += f_bond[bondForceOffset + i];
944  f_normal_y += f_bond[bondForceOffset + i + forceStride];
945  f_normal_z += f_bond[bondForceOffset + i + 2*forceStride];
946  if (MAX_FORCE_NUMBER >= 1) {
947  // gather from strided bond nonbonded force array
948  // accumulate (medium) nonbonded force accumulators
949  f_nbond_x += f_bond_nbond[bondForceOffset + i];
950  f_nbond_y += f_bond_nbond[bondForceOffset + i + forceStride];
951  f_nbond_z += f_bond_nbond[bondForceOffset + i + 2*forceStride];
952  if (MAX_FORCE_NUMBER == 2) {
953  // gather from strided bond slow force array
954  // accumulate slow force accumulators
955  f_slow_x += f_bond_slow[bondForceOffset + i];
956  f_slow_y += f_bond_slow[bondForceOffset + i + forceStride];
957  f_slow_z += f_bond_slow[bondForceOffset + i + 2*forceStride];
958  }
959  }
960  // set normal, nonbonded, and slow SOA force buffers
961  d_f_normal_x[patchOffset + i] = f_normal_x;
962  d_f_normal_y[patchOffset + i] = f_normal_y;
963  d_f_normal_z[patchOffset + i] = f_normal_z;
964 
965  if (MAX_FORCE_NUMBER >= 1) {
966  d_f_nbond_x[patchOffset + i] = f_nbond_x;
967  d_f_nbond_y[patchOffset + i] = f_nbond_y;
968  d_f_nbond_z[patchOffset + i] = f_nbond_z;
969  if (MAX_FORCE_NUMBER == 2) {
970  d_f_slow_x[patchOffset + i] = f_slow_x;
971  d_f_slow_y[patchOffset + i] = f_slow_y;
972  d_f_slow_z[patchOffset + i] = f_slow_z;
973  }
974  }
975 
976  /* Velocity updates */
977  double rp = recipMass[patchOffset + i];
978  vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
979  vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
980  vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
981 
982  d_vel_x[patchOffset + i] += vx;
983  d_vel_y[patchOffset + i] += vy;
984  d_vel_z[patchOffset + i] += vz;
985  }
986 }
987 #endif
988 
989 
990 void SequencerCUDAKernel::accumulate_force_kick(
991  const bool doFixedAtoms,
992  const int doGlobal,
993  const int doCudaGlobal,
994  const int maxForceNumber,
995  const int numPatches,
996  CudaLocalRecord* localRecords,
997  const double* f_bond,
998  const double* f_bond_nbond,
999  const double* f_bond_slow,
1000  int forceStride,
1001  const float4* f_nbond,
1002  const float4* f_nbond_slow,
1003  const CudaForce* f_slow,
1004  double* d_f_global_x,
1005  double* d_f_global_y,
1006  double* d_f_global_z,
1007  double* d_f_normal_x,
1008  double* d_f_normal_y,
1009  double* d_f_normal_z,
1010  double* d_f_nbond_x,
1011  double* d_f_nbond_y,
1012  double* d_f_nbond_z,
1013  double* d_f_slow_x,
1014  double* d_f_slow_y,
1015  double* d_f_slow_z,
1016  double* d_vel_x,
1017  double* d_vel_y,
1018  double* d_vel_z,
1019  const double* recipMass,
1020  const int* d_atomFixed,
1021  const double dt_normal,
1022  const double dt_nbond,
1023  const double dt_slow,
1024  const double scaling,
1025  const int* patchUnsortOrder,
1026  const Lattice lattice,
1027  cudaStream_t stream
1028 ) {
1029  // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
1030  const int options = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms);
1031 #define CALL(MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED) \
1032  accumulateForceKick<MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED> \
1033  <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
1034  ( numPatches, localRecords, \
1035  f_bond, f_bond_nbond, f_bond_slow, forceStride, \
1036  f_nbond, f_nbond_slow, f_slow, \
1037  d_f_global_x, d_f_global_y, d_f_global_z, \
1038  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
1039  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
1040  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
1041  patchUnsortOrder, lattice, \
1042  d_vel_x, d_vel_y, d_vel_z, recipMass, \
1043  d_atomFixed, dt_normal, dt_nbond, dt_slow, scaling)
1044  /*
1045  * Generated by the following python script:
1046  * #!/usr/bin/env python3
1047  * for maxForceNumber in range(3):
1048  * for doGlobal in range(2):
1049  * for doCudaGlobal in range(2):
1050  * for doFixedAtoms in range(2):
1051  * option = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms)
1052  * print(f' case {option}: CALL ({maxForceNumber}, {doGlobal}, {doCudaGlobal}, {doFixedAtoms}); break;')
1053  */
1054  switch (options) {
1055  case 0: CALL (0, 0, 0, 0); break;
1056  case 1: CALL (0, 0, 0, 1); break;
1057  case 2: CALL (0, 0, 1, 0); break;
1058  case 3: CALL (0, 0, 1, 1); break;
1059  case 4: CALL (0, 1, 0, 0); break;
1060  case 5: CALL (0, 1, 0, 1); break;
1061  case 6: CALL (0, 1, 1, 0); break;
1062  case 7: CALL (0, 1, 1, 1); break;
1063  case 8: CALL (1, 0, 0, 0); break;
1064  case 9: CALL (1, 0, 0, 1); break;
1065  case 10: CALL (1, 0, 1, 0); break;
1066  case 11: CALL (1, 0, 1, 1); break;
1067  case 12: CALL (1, 1, 0, 0); break;
1068  case 13: CALL (1, 1, 0, 1); break;
1069  case 14: CALL (1, 1, 1, 0); break;
1070  case 15: CALL (1, 1, 1, 1); break;
1071  case 16: CALL (2, 0, 0, 0); break;
1072  case 17: CALL (2, 0, 0, 1); break;
1073  case 18: CALL (2, 0, 1, 0); break;
1074  case 19: CALL (2, 0, 1, 1); break;
1075  case 20: CALL (2, 1, 0, 0); break;
1076  case 21: CALL (2, 1, 0, 1); break;
1077  case 22: CALL (2, 1, 1, 0); break;
1078  case 23: CALL (2, 1, 1, 1); break;
1079  default: {
1080  const std::string error =
1081  "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " +
1082  std::to_string(maxForceNumber) +
1083  ", doGlobal = " + std::to_string(doGlobal) +
1084  ", doFixedAtoms = " + std::to_string(doFixedAtoms) +
1085  ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
1086  NAMD_bug(error.c_str());
1087  }
1088  }
1089 #undef CALL
1090 }
1091 
1092 template <bool doSlow>
1093 __global__ void copyFNBondToSOA(float4* __restrict__ f_nbond,
1094  float4* __restrict__ f_nbond_slow,
1095  double* f_nbond_x,
1096  double* f_nbond_y,
1097  double* f_nbond_z,
1098  double* f_slow_x,
1099  double* f_slow_y,
1100  double* f_slow_z,
1101  const int* __restrict__ patchIDs,
1102  const int* __restrict__ patchOffsets,
1103  const int* __restrict__ patchUnsortOrder,
1104  const CudaPatchRecord* __restrict__ nbondIndexPerPatch)
1105 {
1106  int unorder;
1107  int stride = blockDim.x;
1108  int pid = patchIDs[blockIdx.x];
1109 
1110  // nbondIndexPerPatch is indexed by the nonbonded compute's local patch ID.
1111  // It is also device-wide, which means it may hold more patches than the
1112  // PatchMap, and its ordering is different, so the global PatchID cannot
1113  // be used directly to index nbondIndexPerPatch.
1114  // Each patchID from the PatchMap must instead be mapped to the
1115  // corresponding entry in nbondIndexPerPatch, i.e. the indexing has to
1116  // follow the ordering used by the computes rather than the global patchIDs.
1117  // nbondIndexPerPatch is the same for both multi- and single-PE runs; only
1118  // the force storage differs, so the only thing that needs updating is the
1119  // use of the local compute index instead of the global patchID.
1120  // Since patches never migrate, this data structure should not change
1121  // during the run.
1122  // A suitable index can be built in the PatchMap using the ordering from
1123  // the computes.
1124  int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1125  int patchOffset = patchOffsets[blockIdx.x];
1126  int natom = nbondIndexPerPatch[pid].numAtoms;
1127  for(int i = threadIdx.x; i < natom; i+= stride){
1128  unorder = patchUnsortOrder[patchOffset + i];
1129  float4 force = f_nbond[globalForceOffset + unorder];
1130  // SOA GLOBAL
1131  f_nbond_x[patchOffset + i] = (double)force.x;
1132  f_nbond_y[patchOffset + i] = (double)force.y;
1133  f_nbond_z[patchOffset + i] = (double)force.z;
1134 
1135  if(doSlow){
1136  // XXX Accumulation here works because each f_slow_* buffer
1137  // is first initialized in copyFSlowToSOA.
1138  float4 f_nb_slow = f_nbond_slow[globalForceOffset + unorder];
1139  f_slow_x[patchOffset + i] += (double)f_nb_slow.x;
1140  f_slow_y[patchOffset + i] += (double)f_nb_slow.y;
1141  f_slow_z[patchOffset + i] += (double)f_nb_slow.z;
1142  }
1143  }
1144 }
1145 
1146 void SequencerCUDAKernel::copy_nbond_forces(int numPatches,
1147  float4* f_nbond,
1148  float4* f_nbond_slow,
1149  double* f_nbond_x,
1150  double* f_nbond_y,
1151  double* f_nbond_z,
1152  double* f_slow_x,
1153  double* f_slow_y,
1154  double* f_slow_z,
1155  const int* patchIDs,
1156  const int* patchOffsets,
1157  const int* patchUnsortOrder,
1158  const CudaPatchRecord* pr,
1159  const bool doSlow,
1160  cudaStream_t stream){
1161 
1162  if(doSlow){
1163  copyFNBondToSOA<1><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1164  f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1165  patchIDs, patchOffsets, patchUnsortOrder, pr);
1166  }else {
1167  copyFNBondToSOA<0><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1168  f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1169  patchIDs, patchOffsets, patchUnsortOrder, pr);
1170  }
1171 }
1172 
1173 #if 0
1174 // this function will fuse copyFNBond, copyFbond, addForceToMomentum and updateVelocities (hopefully)
1175 // XXX TODO: missing slow forces support
1176 template<bool T_DONBOND, bool T_DOSLOW>
1177 __global__ void fetchForcesAndUpdateVelocitiesKernel(
1178  const double scaling,
1179  double dt_normal,
1180  double dt_nbond,
1181  double dt_slow,
1182  /* copyFbondToSOA parameters*/
1183  const double* __restrict__ f_bond, // forces from bond compute
1184  const double* __restrict__ f_bond_nbond, // forces from bond compute
1185  double* __restrict__ f_bond_x,
1186  double* __restrict__ f_bond_y,
1187  double* __restrict__ f_bond_z,
1188  double* __restrict__ f_nbond_x,
1189  double* __restrict__ f_nbond_y,
1190  double* __restrict__ f_nbond_z,
1191  double* __restrict__ f_slow_x,
1192  double* __restrict__ f_slow_y,
1193  double* __restrict__ f_slow_z,
1194  int forceStride,
1195  const PatchRecord* __restrict__ b_pr,
1196  const int* __restrict__ patchIDs,
1197  const int* __restrict__ patchOffsets,
1198  /* copyFNbondToSOA parameters*/
1199  const float4* __restrict__ f_nbond, // forces from nonbonded compute
1200  const float4* __restrict__ f_nbond_slow, // slow forces from nonbonded compute
1201  const int* __restrict__ patchSortOrder,
1202  const CudaPatchRecord* __restrict__ nbondIndexPerPatch,
1203  /* copyFSlowToSOA */
1204  const CudaForce* __restrict__ f_slow,
1205  const int* __restrict__ patchPositions,
1206  const int* __restrict__ pencilPatchIndex,
1207  const int* __restrict__ patchOffsets,
1208  const int* __restrict__ patchIDs,
1209  const int* __restrict__ slow_patchIDs,
1210  const CudaLattice* __restrict__ lattices,
1211  /* addForceToMomentum */
1212  const double * __restrict recipMass,
1213  double * __restrict vel_x,
1214  double * __restrict vel_y,
1215  double * __restrict vel_z,
1216  /* updateRigidArrays */
1217  const double * __restrict pos_x,
1218  const double * __restrict pos_y,
1219  const double * __restrict pos_z,
1220  double * __restrict velNew_x,
1221  double * __restrict velNew_y,
1222  double * __restrict velNew_z,
1223  double * __restrict posNew_x,
1224  double * __restrict posNew_y,
1225  double * __restrict posNew_z
1226  ){
1227  int order;
1228  int stride = blockDim.x;
1229  int pid = patchIDs[blockIdx.x];
1230 
1231  int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1232  int b_forceOffset = b_pr[patchID].atomStart;
1233 
1234  int patchOffset = patchOffsets[blockIdx.x];
1235  int natom = nbondIndexPerPatch.numAtoms;
1236 
1237  float4 f_b;
1238  float4 f_nb;
1239  float4 f_s;
1240  for(int i = threadIdx.x; i < natom; i += stride){
1241  f_b = {0};
1242  if(T_DONBOND) f_nb = {0};
1243  if(T_DOSLOW) f_s = {0};
1244  // fetch the bonded forces first
1245  f_b.x = f_bond[b_forceOffset + i];
1246  f_b.y = f_bond[b_forceOffset + i + forceStride];
1247  f_b.z = f_bond[b_forceOffset + i + 2*forceStride];
1248 
1249  if(T_DONBOND){
1250  f_nb = f_nbond[globalForceOffset + i];
1251  f_nb.x += f_bond_nbond[b_forceOffset + i];
1252  f_nb.y += f_bond_nbond[b_forceOffset + i + forceStride];
1253  f_nb.z += f_bond_nbond[b_forceOffset + i + 2*forceStride];
1254  }
1255  // addForceToMomentum now
1256  // this striding is not good
1257  float vx = vel_x[patchOffset + i];
1258  float vy = vel_y[patchOffset + i];
1259  float vz = vel_z[patchOffset + i];
1260  float rmass = recipMass[patchOffset + i];
1261  switch(maxForceNumber){
1262  // XXX TODO: Case 2 for slow forces
1263  case 1:
1264  dt_nbond *= scaling;
1265  vx += f_nb.x * rmass * dt_nbond;
1266  vy += f_nb.y * rmass * dt_nbond;
1267  vz += f_nb.z * rmass * dt_nbond;
1268  case 0:
1269  dt_normal *= scaling;
1270  vx += f_b.x * rmass * dt_nbond;
1271  vy += f_b.y * rmass * dt_nbond;
1272  vz += f_b.z * rmass * dt_nbond;
1273  }
1274  // that's it
1275  // updateRigidArrays
1276 
1277  posNew_x[patchOffset + i] = pos_x[i] + (vx * dt);
1278  posNew_y[patchOffset + i] = pos_y[i] + (vy * dt);
1279  posNew_z[patchOffset + i] = pos_z[i] + (vz * dt);
1280  vel_x[patchOffset + i] = vx;
1281  vel_y[patchOffset + i] = vy;
1282  vel_z[patchOffset + i] = vz;
1283  velNew_x[patchOffset + i] = vx;
1284  velNew_y[patchOffset + i] = vy;
1285  velNew_z[patchOffset + i] = vz;
1286  f_normal_x[patchOffset + i] = f_b.x;
1287  f_normal_y[patchOffset + i] = f_b.y;
1288  f_normal_z[patchOffset + i] = f_b.z;
1289 
1290  if(T_DONBOND){
1291  order = patchSortOrder[patchOffset + i];
1292  f_nbond_x[patchOffset + order] = f_nb.x;
1293  f_nbond_y[patchOffset + order] = f_nb.y;
1294  f_nbond_z[patchOffset + order] = f_nb.z;
1295  }
1296  }
1297 }
1298 
1299 void SequencerCUDAKernel::fetchForcesAndUpdateVelocities(int numPatches,
1300  const bool doNbond,
1301  const bool doSlow,
1302  const double scaling,
1303  double dt_normal,
1304  double dt_nbond,
1305  double dt_slow,
1306  const int maxForceNumber,
1307  /* copyFbondToSOA parameters*/
1308  const double* __restrict__ f_bond, // forces from bond compute
1309  const double* __restrict__ f_bond_nbond, // forces from bond compute
1310  double* f_bond_x,
1311  double* f_bond_y,
1312  double* f_bond_z,
1313  double* f_nbond_x,
1314  double* f_nbond_y,
1315  double* f_nbond_z,
1316  double* f_slow_x,
1317  double* f_slow_y,
1318  double* f_slow_z,
1319  int forceStride,
1320  const PatchRecord* b_pr,
1321  const int* patchIDs,
1322  const int* patchOffsets,
1323  /* copyFNbondToSOA parameters*/
1324  const float4* f_nbond, // forces from nonbonded compute
1325  const float4* f_nbond_slow, // slow forces from nonbonded compute
1326  const int* patchSortOrder,
1327  const CudaPatchRecord* nbondIndexPerPatch,
1328  /* copyFSlowToSOA */
1329  const CudaForce* f_slow,
1330  const int* patchPositions,
1331  const int* pencilPatchIndex,
1332  const int* patchOffsets,
1333  const int* patchIDs,
1334  const int* slow_patchIDs,
1335  const CudaLattice* lattices,
1336  /* addForceToMomentum */
1337  const double * recipMass,
1338  double * vel_x,
1339  double * vel_y,
1340  double * vel_z,
1341  /* updateRigidArrays */
1342  const double * pos_x,
1343  const double * pos_y,
1344  const double * pos_z,
1345  double * velNew_x,
1346  double * velNew_y,
1347  double * velNew_z,
1348  double * posNew_x,
1349  double * posNew_y,
1350  double * posNew_z,
1351  cudaStream_t stream){
1352 
1353  // reduce the amount of arguments in this function
1354  int blocks = 128;
1355  int grid = numPatches;
1356 
1357  // XXX TODO finish this
1358  if(doNbond){
1359  if(doSlow){
1360  fetchForcesAndUpdateVelocities<true, true><<<numPatches, 128, 0, stream>>>();
1361  }else{
1362  fetchForcesAndUpdateVelocities<true, false><<<numPatches, 128, 0, stream>>>();
1363  }
1364  }else{
1365  fetchForcesAndUpdateVelocities<false, false><<<numPatches, 128, 0, stream>>>();
1366  }
1367 }
1368 #endif
1369 
1370 // JM: I need a function to do the pairlistCheck
1371 template<bool T_ISPERIODIC>
1372 __global__ void pairListMarginCheckKernel(
1373  const int numPatches,
1374  CudaLocalRecord* localRecords,
1375  const double* __restrict pos_x,
1376  const double* __restrict pos_y,
1377  const double* __restrict pos_z,
1378  const double* __restrict pos_old_x,
1379  const double* __restrict pos_old_y,
1380  const double* __restrict pos_old_z,
1381  const double3* __restrict awayDists, // for margin check
1382  const Lattice lattice,
1383  const Lattice latticeOld,
1384  const double3* __restrict patchMins,
1385  const double3* __restrict patchMaxes,
1386  const double3* __restrict patchCenter,
1387  const CudaMInfo* __restrict mInfo,
1388  unsigned int* __restrict tbcatomic,
1389  double* patchMaxAtomMovement,
1390  double* h_patchMaxAtomMovement,
1391  double* patchNewTolerance,
1392  double* h_patchNewTolerance,
1393  const double minSize,
1394  const double cutoff,
1395  const double sysdima,
1396  const double sysdimb,
1397  const double sysdimc,
1398  unsigned int* __restrict h_marginViolations,
1399  unsigned int* __restrict h_periodicCellSmall,
1400  const bool rescalePairlistTolerance
1401 ) {
1402  __shared__ CudaLocalRecord s_record;
1403  using AccessType = int32_t;
1404  AccessType* s_record_buffer = (AccessType*) &s_record;
1405 
1406  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1407  // Read in the CudaLocalRecord using multiple threads. This should be faster than a single-threaded copy.
1408  #pragma unroll 1
1409  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1410  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1411  }
1412  __syncthreads();
1413 
1414  const int pid = s_record.patchID;
1415 
1416 
1417  // Each patch has its own maximum corner displacement (max_cd).
1418  // Since there is one thread block per patch, it is computed here:
1419  // the eight corners of the patch bounding box are checked once per
1420  // patch, by thread 0 of the block.
1421  double cd, max_cd, max_pd;
1422  __shared__ double minx, miny, minz, maxx, maxy, maxz;
1423  double3 corner, corner_unscaled, corner2_unscaled;
1424  double3 mValue;
1425  double3 s;
1426 
1427  const int stride = blockDim.x;
1428 
1429  const int ind = patchIndex;
1430  double3 min = patchMins[ind];
1431  double3 max = patchMaxes[ind];
1432  double3 aDist = awayDists[ind];
1433 
1434  const int start = s_record.bufferOffset;
1435  const int numAtoms = s_record.numAtoms;
1436  double3 center = patchCenter[ind];
1437 
1438  double3 center_cur = lattice.unscale(center);
1439  double3 center_old = latticeOld.unscale(center);
1440 
1441  center_cur.x -= center_old.x;
1442  center_cur.y -= center_old.y;
1443  center_cur.z -= center_old.z;
1444 
1445  max_cd = 0.f;
1446  max_pd = 0.f;
1447  if(threadIdx.x == 0){
1448  if (rescalePairlistTolerance) patchNewTolerance[ind] *= (1.0f - c_pairlistShrink);
1449 
1450  // doMarginCheck -> checks whether the periodic cell has become too small:
1451 
1452  // These values are in registers to begin with, so the overhead here is small;
1453  // if this condition is true, the simulation will stop
1454  mValue.x = (0.5) * (aDist.x - cutoff / sysdima);
1455  mValue.y = (0.5) * (aDist.y - cutoff / sysdimb);
1456  mValue.z = (0.5) * (aDist.z - cutoff / sysdimc);
1457  minx = min.x - mValue.x;
1458  miny = min.y - mValue.y;
1459  minz = min.z - mValue.z;
1460  maxx = max.x + mValue.x;
1461  maxy = max.y + mValue.y;
1462  maxz = max.z + mValue.z;
1463 
1464  for(int i = 0; i < 2; i++){
1465  for(int j =0 ; j< 2; j++){
1466  for(int k = 0; k < 2; k++){
1467  corner.x = (i ? min.x : max.x);
1468  corner.y = (j ? min.y : max.y);
1469  corner.z = (k ? min.z : max.z);
1470  corner_unscaled = lattice.unscale(corner);
1471  corner2_unscaled = latticeOld.unscale(corner);
1472  corner.x = (corner_unscaled.x - corner2_unscaled.x) - center_cur.x;
1473  corner.y = (corner_unscaled.y - corner2_unscaled.y) - center_cur.y;
1474  corner.z = (corner_unscaled.z - corner2_unscaled.z) - center_cur.z;
1475  cd = corner.x * corner.x +
1476  corner.y * corner.y + corner.z * corner.z;
1477  if (cd > max_cd) max_cd = cd;
1478  }
1479  }
1480  }
1481  }
1482  __syncthreads();
1483 
1484  // Now that max_cd is known for this patch, compute max_pd, the maximum
1485  // squared displacement of any atom in the patch, using the atom count
1486  // and buffer offset already read from the local record.
1487  // Margin violations are counted in the same pass.
1488  unsigned int mc = 0;
1489  for(int i = threadIdx.x; i < numAtoms; i += stride){
1490  // Margin violations are also checked here, using the away distances
1491  double3 pos;
1492  pos.x = pos_x[start + i];
1493  pos.y = pos_y[start + i];
1494  pos.z = pos_z[start + i];
1495  s = lattice.scale(pos);
1496  if(T_ISPERIODIC){
1497  // This is true if the system in periodic in A, B and C
1498  // if any of these clauses are true, atom needs to migrate
1499  mc += ((s.x < minx || s.x >= maxx) || (s.y < miny || s.y >= maxy) || (s.z < minz || s.z >= maxz)) ? 1 : 0;
1500  }else{
1501  int xdev, ydev, zdev;
1502  // Ok, so if the system is not periodic, we need to access the mInfo data structure to determine migrations
1503  if (s.x < minx) xdev = 0;
1504  else if (maxx <= s.x) xdev = 2;
1505  else xdev = 1;
1506 
1507  if (s.y < miny) ydev = 0;
1508  else if (maxy <= s.y) ydev = 2;
1509  else ydev = 1;
1510 
1511  if (s.z < minz) zdev = 0;
1512  else if (maxz <= s.z) zdev = 2;
1513  else zdev = 1;
1514 
1515  if(xdev != 1 || ydev != 1 || zdev != 1){
1516  // we check if any *dev are different than zero to prevent this horrible global memory access here
1517  int destPatch = mInfo[ind].destPatchID[xdev][ydev][zdev];
1518  if(destPatch != -1 && destPatch != pid) mc += 1; // atom needs to migrate
1519  }
1520  }
1521  corner.x = (pos.x - pos_old_x[start + i]) - center_cur.x;
1522  corner.y = (pos.y - pos_old_y[start + i]) - center_cur.y;
1523  corner.z = (pos.z - pos_old_z[start + i]) - center_cur.z;
1524  cd = corner.x * corner.x +
1525  corner.y * corner.y + corner.z * corner.z;
1526  if(cd > max_pd) max_pd = cd;
1527  }
1528  // JM NOTE: The atomic add to host memory is costly, but if there are margin
1529  // violations the simulation is already going badly
1530  if (mc != 0) {
1531  atomicAdd(h_marginViolations, mc);
1532  }
1533 
1534  typedef cub::BlockReduce<BigReal, PATCH_BLOCKS> BlockReduce;
1535  __shared__ typename BlockReduce::TempStorage temp_storage;
1536  max_pd = BlockReduce(temp_storage).Reduce(max_pd, cub::Max());
1537  if(threadIdx.x == 0){
1538  max_pd = sqrt(max_cd) + sqrt(max_pd);
1539  // this might not be needed since the host-mapped values are updated below anyway
1540  patchMaxAtomMovement[ind] = max_pd;
1541  if(max_pd > (1.f - c_pairlistTrigger) * patchNewTolerance[ind]){
1542  patchNewTolerance[ind] *= (1.f + c_pairlistGrow);
1543  }
1544  if(max_pd > patchNewTolerance[ind]){
1545  patchNewTolerance[ind] = max_pd / (1.f - c_pairlistTrigger);
1546  }
1547  // printf("patchNewTolerance[%d] = %lf %lf\n", ind,
1548  // patchNewTolerance[ind], patchMaxAtomMovement[ind]);
1549  h_patchMaxAtomMovement[ind] = max_pd; // Host-mapped update
1550  h_patchNewTolerance[ind] = patchNewTolerance[ind];
1551  }
1552 
1553  // Checks if periodic cell has become too small and flags it
1554  if(threadIdx.x == 0){
1555  if ( ( aDist.x*sysdima < minSize*0.9999 ) ||
1556  ( aDist.y*sysdimb < minSize*0.9999 ) ||
1557  ( aDist.z*sysdimc < minSize*0.9999 ) ||
1558  ( mValue.x < -0.0001 ) ||
1559  ( mValue.y < -0.0001 ) ||
1560  ( mValue.z < -0.0001)) {
1561  *h_periodicCellSmall = 1;
1562  }
1563  }
1564  }
1565 }
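// The tolerance update above reduces to the following scalar rule: with the per-patch
// maximum displacement maxMove = sqrt(max_cd) + sqrt(max_pd), the pairlist tolerance
// grows whenever the movement approaches the trigger fraction. A minimal host-side
// sketch (illustrative only; sketchUpdatePairlistTolerance is a hypothetical name,
// not part of the NAMD API):
static inline double sketchUpdatePairlistTolerance(
    double maxMove, double tolerance,
    double pairlistTrigger, double pairlistGrow)
{
  // grow the tolerance once the movement exceeds (1 - trigger) of the current tolerance
  if (maxMove > (1.0 - pairlistTrigger) * tolerance) {
    tolerance *= (1.0 + pairlistGrow);
  }
  // if the movement already exceeds the tolerance itself, jump directly past it
  if (maxMove > tolerance) {
    tolerance = maxMove / (1.0 - pairlistTrigger);
  }
  return tolerance;
}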
1566 
1567 void SequencerCUDAKernel::PairListMarginCheck(const int numPatches,
1568  CudaLocalRecord* localRecords,
1569  const double* pos_x,
1570  const double* pos_y,
1571  const double* pos_z,
1572  const double* pos_old_x,
1573  const double* pos_old_y,
1574  const double* pos_old_z,
1575  const double3* awayDists, // for margin check
1576  const Lattice lattices,
1577  const Lattice lattices_old,
1578  const double3* __restrict patchMins,
1579  const double3* __restrict patchMaxes,
1580  const double3* __restrict patchCenter,
1581  const CudaMInfo* __restrict mInfo,
1582  unsigned int* __restrict tbcatomic,
1583  const double pairlistTrigger,
1584  const double pairlistGrow,
1585  const double pairlistShrink,
1586  double* __restrict patchMaxAtomMovement,
1587  double* __restrict h_patchMaxAtomMovement,
1588  double* __restrict patchNewTolerance,
1589  double* __restrict h_patchNewTolerance,
1590  const double minSize,
1591  const double cutoff,
1592  const double sysdima,
1593  const double sysdimb,
1594  const double sysdimc,
1595  unsigned int* h_marginViolations,
1596  unsigned int* h_periodicCellSmall,
1597  const bool rescalePairlistTolerance,
1598  const bool isPeriodic,
1599  cudaStream_t stream){
1600  int grid = numPatches;
1601 
1602  if(!this->intConstInit){
1603  this->intConstInit = true;
1604  cudaCheck(cudaMemcpyToSymbol(c_pairlistTrigger, &pairlistTrigger, sizeof(double)));
1605  cudaCheck(cudaMemcpyToSymbol(c_pairlistGrow, &pairlistGrow, sizeof(double)));
1606  cudaCheck(cudaMemcpyToSymbol(c_pairlistShrink, &pairlistShrink, sizeof(double)));
1607  }
1608 
1609  if(isPeriodic){
1610  pairListMarginCheckKernel<true><<<grid, PATCH_BLOCKS, 0, stream>>>(
1611  numPatches, localRecords,
1612  pos_x, pos_y, pos_z, pos_old_x,
1613  pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1614  patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1615  patchMaxAtomMovement, h_patchMaxAtomMovement,
1616  patchNewTolerance, h_patchNewTolerance,
1617  minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1618  rescalePairlistTolerance);
1619  }
1620  else{
1621  pairListMarginCheckKernel<false><<<grid, PATCH_BLOCKS, 0, stream>>>(
1622  numPatches, localRecords,
1623  pos_x, pos_y, pos_z, pos_old_x,
1624  pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1625  patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1626  patchMaxAtomMovement, h_patchMaxAtomMovement,
1627  patchNewTolerance, h_patchNewTolerance,
1628  minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1629  rescalePairlistTolerance);
1630  }
1631 }
1632 
1633 template <bool doNbond, bool doSlow>
1634 __global__ void copyFBondToSOA(double *f_bond,
1635  double *f_bond_nbond,
1636  double *f_bond_slow,
1637  double *f_bond_x,
1638  double *f_bond_y,
1639  double *f_bond_z,
1640  double *f_nbond_x,
1641  double *f_nbond_y,
1642  double *f_nbond_z,
1643  double *f_slow_x,
1644  double *f_slow_y,
1645  double *f_slow_z,
1646  const int forceStride,
1647  const PatchRecord *pr,
1648  const int *patchIDs,
1649  const int *patchOffsets)
1650 {
1651  // I suppose if I work with the entire PatchMap, this should work?
1652  // Same thing, each block gets a patch
1653  // What do I need -> Forces + atomStart
1654  // the bonded forces are wrong here.
1655  // What are forceOffset and patchOffset?
1656  int stride = blockDim.x;
1657  int patchID = patchIDs[blockIdx.x];
1658  // we need to check this data structure first to make sure it is safe to access it using patchID
1659  // I think patchID is the correct way of indexing this data structure. Does this change with +p?
1660  int natoms = pr[patchID].numAtoms;
1661  int forceOffset = pr[patchID].atomStart;
1662  int patchOffset = patchOffsets[blockIdx.x];
1663 
1664  for(int i = threadIdx.x; i < natoms; i+=stride){
1665  // LOCAL GLOBAL
1666  f_bond_x[patchOffset + i] = f_bond[forceOffset + i];
1667  f_bond_y[patchOffset + i] = f_bond[forceOffset + i + forceStride];
1668  f_bond_z[patchOffset + i] = f_bond[forceOffset + i + 2*forceStride];
1669 
1670  if(doNbond){
1671  // XXX Accumulation here works because each f_nbond_* buffer
1672  // is first initialized in copyFNBondToSOA.
1673  f_nbond_x[patchOffset + i] += f_bond_nbond[forceOffset + i];
1674  f_nbond_y[patchOffset + i] += f_bond_nbond[forceOffset + i + forceStride];
1675  f_nbond_z[patchOffset + i] += f_bond_nbond[forceOffset + i + 2*forceStride];
1676  }
1677 
1678  if(doSlow){
1679  // XXX Accumulation here works because each f_slow_* buffer
1680  // is first initialized in copyFSlowToSOA.
1681  f_slow_x[patchOffset + i] += f_bond_slow[forceOffset + i];
1682  f_slow_y[patchOffset + i] += f_bond_slow[forceOffset + i + forceStride];
1683  f_slow_z[patchOffset + i] += f_bond_slow[forceOffset + i + 2*forceStride];
1684  }
1685  }
1686 }
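// The bonded force buffers consumed above are stored as one strided array per force
// type: component c of atom i lives at f[i + c*forceStride]. A minimal indexing sketch
// (illustrative only; sketchStridedForceComponent is a hypothetical name):
__host__ __device__ inline double sketchStridedForceComponent(
    const double* f, int atomIndex, int component, int forceStride)
{
  // component: 0 = x, 1 = y, 2 = z
  return f[atomIndex + component * forceStride];
}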
1687 
1688 void SequencerCUDAKernel::copy_bond_forces(int numPatches,
1689  double* f_bond,
1690  double* f_bond_nbond,
1691  double* f_bond_slow,
1692  double* f_bond_x,
1693  double* f_bond_y,
1694  double* f_bond_z,
1695  double* f_nbond_x,
1696  double* f_nbond_y,
1697  double* f_nbond_z,
1698  double* f_slow_x,
1699  double* f_slow_y,
1700  double* f_slow_z,
1701  int forceStride, //if stridedForces
1702  PatchRecord* pr,
1703  const int* patchIDs,
1704  const int* patchOffsets,
1705  bool doNbond,
1706  bool doSlow,
1707  cudaStream_t stream)
1708 {
1709  if(doSlow){
1710  copyFBondToSOA<true, true><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1711  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1712  f_nbond_x, f_nbond_y, f_nbond_z,
1713  f_slow_x, f_slow_y, f_slow_z, forceStride,
1714  pr, patchIDs, patchOffsets);
1715  }else if(doNbond){
1716  copyFBondToSOA<true, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1717  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1718  f_nbond_x, f_nbond_y, f_nbond_z,
1719  f_slow_x, f_slow_y, f_slow_z, forceStride,
1720  pr, patchIDs, patchOffsets);
1721  }else{
1722  copyFBondToSOA<false, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1723  f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1724  f_nbond_x, f_nbond_y, f_nbond_z,
1725  f_slow_x, f_slow_y, f_slow_z, forceStride,
1726  pr, patchIDs, patchOffsets);
1727  }
1728 }
1729 
1730 __global__ void copyFSlowToSOA(const CudaForce* __restrict__ f_slow,
1731  double* f_slow_x,
1732  double* f_slow_y,
1733  double* f_slow_z,
1734  const int* __restrict__ patchOffsets,
1735  const Lattice* __restrict__ lattices)
1736 {
1737  int tid = threadIdx.x;
1738  int stride = blockDim.x;
1739  int patchOffset = patchOffsets[blockIdx.x];
1740  int numAtoms = patchOffsets[blockIdx.x + 1] - patchOffset;
1741 
1742  Lattice lat = lattices[0];
1743 
1744  for(int i = tid; i < numAtoms; i += stride){
1745  CudaForce f = f_slow[patchOffset + i];
1746 
1747  double3 f_scaled = lat.scale_force(
1748  Vector((double)f.x, (double)f.y, (double)f.z));
1749 
1750  // XXX Instead of accumulating slow force (+=), set them here (=)!
1751  f_slow_x[patchOffset + i] = f_scaled.x;
1752  f_slow_y[patchOffset + i] = f_scaled.y;
1753  f_slow_z[patchOffset + i] = f_scaled.z;
1754  }
1755 }
1756 
1757 void SequencerCUDAKernel::copy_slow_forces(int numPatches,
1758  const CudaForce* f_slow,
1759  double* f_slow_x,
1760  double* f_slow_y,
1761  double* f_slow_z,
1762  const int* d_patchOffsets,
1763  const Lattice *lattices,
1764  cudaStream_t stream){
1765 
1766  copyFSlowToSOA<<<numPatches, PATCH_BLOCKS, 0, stream>>>(f_slow,
1767  f_slow_x, f_slow_y, f_slow_z, d_patchOffsets, lattices);
1768 }
1769 
1770 __forceinline__ __device__ void zero_cudaTensor(cudaTensor &v)
1771 {
1772  v.xx = 0.0;
1773  v.xy = 0.0;
1774  v.xz = 0.0;
1775  v.yx = 0.0;
1776  v.yy = 0.0;
1777  v.yz = 0.0;
1778  v.zx = 0.0;
1779  v.zy = 0.0;
1780  v.zz = 0.0;
1781 }
1782 
1783 template <int MAX_FORCE_NUMBER, bool DO_VEL_RESCALING, bool DO_FIXED>
1784 __global__ void addForceToMomentumKernel(const double scaling,
1785  double dt_normal,
1786  double dt_nbond,
1787  double dt_slow,
1788  double velrescaling, // for stochastic velocity rescaling
1789  const double * __restrict recipMass,
1790  const double * __restrict f_normal_x,
1791  const double * __restrict f_normal_y,
1792  const double * __restrict f_normal_z,
1793  const double * __restrict f_nbond_x,
1794  const double * __restrict f_nbond_y,
1795  const double * __restrict f_nbond_z,
1796  const double * __restrict f_slow_x,
1797  const double * __restrict f_slow_y,
1798  const double * __restrict f_slow_z,
1799  double * __restrict vel_x,
1800  double * __restrict vel_y,
1801  double * __restrict vel_z,
1802  const int * __restrict atomFixed,
1803  const int numAtoms)
1804 {
1805  int i = threadIdx.x + blockIdx.x*blockDim.x;
1806  if (i < numAtoms) {
1807 #if 0
1808  switch (maxForceNumber) {
1809  case 2:
1810  dt_slow *= scaling;
1811  vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
1812  vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
1813  vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
1814  // fall through because we will always have the "faster" forces
1815  case 1:
1816  dt_nbond *= scaling;
1817  vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
1818  vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
1819  vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
1820  // fall through because we will always have the "faster" forces
1821  case 0:
1822  dt_normal *= scaling;
1823  vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
1824  vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
1825  vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
1826  }
1827 #else
1828  double vx = 0;
1829  double vy = 0;
1830  double vz = 0;
1831  switch (MAX_FORCE_NUMBER) {
1832  case 2:
1833  vx += f_slow_x[i] * dt_slow;
1834  vy += f_slow_y[i] * dt_slow;
1835  vz += f_slow_z[i] * dt_slow;
1836  // fall through because we will always have the "faster" forces
1837  case 1:
1838  vx += f_nbond_x[i] * dt_nbond;
1839  vy += f_nbond_y[i] * dt_nbond;
1840  vz += f_nbond_z[i] * dt_nbond;
1841  // fall through because we will always have the "faster" forces
1842  case 0:
1843  vx += f_normal_x[i] * dt_normal;
1844  vy += f_normal_y[i] * dt_normal;
1845  vz += f_normal_z[i] * dt_normal;
1846  }
1847  vx *= scaling * recipMass[i];
1848  vy *= scaling * recipMass[i];
1849  vz *= scaling * recipMass[i];
1850  if (!DO_FIXED) {
1851  if (DO_VEL_RESCALING) {
1852  vel_x[i] = velrescaling*vel_x[i] + vx;
1853  vel_y[i] = velrescaling*vel_y[i] + vy;
1854  vel_z[i] = velrescaling*vel_z[i] + vz;
1855  }
1856  else {
1857  vel_x[i] += vx;
1858  vel_y[i] += vy;
1859  vel_z[i] += vz;
1860  }
1861  } else { // fixed atoms
1862  if (!atomFixed[i]) {
1863  if (DO_VEL_RESCALING) {
1864  vel_x[i] = velrescaling*vel_x[i] + vx;
1865  vel_y[i] = velrescaling*vel_y[i] + vy;
1866  vel_z[i] = velrescaling*vel_z[i] + vz;
1867  }
1868  else {
1869  vel_x[i] += vx;
1870  vel_y[i] += vy;
1871  vel_z[i] += vz;
1872  }
1873  } else {
1874 
1875  vel_x[i] = 0;
1876  vel_y[i] = 0;
1877  vel_z[i] = 0;
1878  }
1879  }
1880 #endif
1881  }
1882 }
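// For a single velocity component, the fall-through switch above is equivalent to the
// scalar update below (a minimal sketch assuming maxForceNumber == 2 and fixed atoms
// disabled; sketchMtsVelocityUpdate is a hypothetical name):
__host__ __device__ inline double sketchMtsVelocityUpdate(
    double vel, double recipMass, double scaling, double velrescaling,
    double f_normal, double dt_normal,
    double f_nbond, double dt_nbond,
    double f_slow,  double dt_slow)
{
  const double dv = scaling * recipMass *
      (f_normal * dt_normal + f_nbond * dt_nbond + f_slow * dt_slow);
  return velrescaling * vel + dv;  // velrescaling == 1.0 when no rescaling is requested
}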
1883 
1884 // JM: This sets the cudaAtom vector from the nonbonded compute
1885 template <bool t_doNbond, bool t_doHomePatches>
1886 __global__ void setComputePositionsKernel(
1887  CudaLocalRecord* localRecords,
1888  CudaPeerRecord* peerRecords,
1889  const double3* __restrict patchCenter,
1890  const int* __restrict patchSortOrder,
1891  const int numPatches,
1892  const unsigned int devID,
1893  const Lattice lat,
1894  const double charge_scaling,
1895  const double* __restrict d_pos_x,
1896  const double* __restrict d_pos_y,
1897  const double* __restrict d_pos_z,
1898  const float* __restrict charges,
1899  double** __restrict d_peer_pos_x,
1900  double** __restrict d_peer_pos_y,
1901  double** __restrict d_peer_pos_z,
1902  float4* __restrict nb_atoms,
1903  float4* __restrict b_atoms,
1904  float4* __restrict s_atoms
1905 ) {
1906 
1907  __shared__ CudaLocalRecord s_record;
1908  using AccessType = int32_t;
1909  AccessType* s_record_buffer = (AccessType*) &s_record;
1910 
1911  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1912  // Read in the CudaLocalRecord using multiple threads
1913  #pragma unroll 1
1914  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1915  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1916  }
1917  __syncthreads();
1918 
1919  double3 center;
1920  //int soapid = globalToLocalID[record.patchID];
1921  center = patchCenter[patchIndex];
1922  double3 ucenter = lat.unscale(center);
1923 
1924  const int numAtoms = s_record.numAtoms;
1925  const int dstOffset = s_record.bufferOffset;
1926  const int dstOffsetNB = s_record.bufferOffsetNBPad;
1927 
1928  int srcDevice, srcOffset;
1929  const double *srcPosX, *srcPosY, *srcPosZ;
1930  if (t_doHomePatches) {
1931  srcDevice = devID;
1932  srcOffset = dstOffset;
1933  srcPosX = d_pos_x;
1934  srcPosY = d_pos_y;
1935  srcPosZ = d_pos_z;
1936  } else {
1937  srcDevice = s_record.inline_peers[0].deviceIndex;
1938  srcOffset = s_record.inline_peers[0].bufferOffset;
1939  srcPosX = d_peer_pos_x[srcDevice];
1940  srcPosY = d_peer_pos_y[srcDevice];
1941  srcPosZ = d_peer_pos_z[srcDevice];
1942  }
1943 
1944  float4 atom;
1945  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1946  const int order = patchSortOrder[dstOffset + i]; // Should this be order or unorder?
1947  atom.x = srcPosX[srcOffset + order] - ucenter.x;
1948  atom.y = srcPosY[srcOffset + order] - ucenter.y;
1949  atom.z = srcPosZ[srcOffset + order] - ucenter.z;
1950  atom.w = charges[dstOffset + order] * charge_scaling;
1951 
1952  b_atoms[dstOffset + order] = atom;
1953  if (t_doNbond) nb_atoms[dstOffsetNB + i] = atom;
1954  }
1955 
1956  if (t_doNbond) {
1957  if (threadIdx.x / WARPSIZE == 0) {
1958  const int to_write = (((numAtoms+32-1)/32)*32) - numAtoms; // WARPSIZE
1959 
1960  float4 lastAtom;
1961  const int order = patchSortOrder[dstOffset + numAtoms - 1]; // Should this be order or unorder?
1962  lastAtom.x = srcPosX[srcOffset + order] - ucenter.x;
1963  lastAtom.y = srcPosY[srcOffset + order] - ucenter.y;
1964  lastAtom.z = srcPosZ[srcOffset + order] - ucenter.z;
1965  lastAtom.w = charges[dstOffset + order] * charge_scaling;
1966 
1967  if (threadIdx.x < to_write) {
1968  nb_atoms[dstOffsetNB+numAtoms+threadIdx.x] = lastAtom;
1969  }
1970  }
1971  }
1972  __syncthreads();
1973  }
1974 }
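// The tail handling above pads each patch's nonbonded atom list up to a multiple of the
// warp size and fills the padding slots with copies of the last real atom, so warp-wide
// loads in the nonbonded kernels never read uninitialized entries. A minimal sketch of
// the padding arithmetic (sketchWarpPaddedCount is a hypothetical name):
__host__ __device__ inline int sketchWarpPaddedCount(int numAtoms, int warpSize)
{
  return ((numAtoms + warpSize - 1) / warpSize) * warpSize;
}
// e.g. sketchWarpPaddedCount(70, 32) == 96, so 26 padding entries would be written.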
1975 
1976 template<bool t_doHomePatches, bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
1977 __global__ void setComputePositionsKernel_PME (
1978  const double* __restrict d_pos_x,
1979  const double* __restrict d_pos_y,
1980  const double* __restrict d_pos_z,
1981  const float* __restrict charges,
1982  double** __restrict d_peer_pos_x,
1983  double** __restrict d_peer_pos_y,
1984  double** __restrict d_peer_pos_z,
1985  float** __restrict d_peer_charge,
1986  int** __restrict d_peer_partition,
1987  const int* __restrict partition,
1988  const double charge_scaling,
1989  const int* __restrict s_patchPositions,
1990  const int* __restrict s_pencilPatchIndex,
1991  const int* __restrict s_patchIDs,
1992  const Lattice lat,
1993  float4* __restrict s_atoms,
1994  int numTotalAtoms,
1995  int* __restrict s_partition,
1996 // NEW INPUT
1997  const int peerDevice,
1998  const int numAtoms,
1999  const int offset,
2000  const bool handleBoundary
2001 ) {
2002 
2003  float4 foo;
2004 
2005  const int pmeBufferOffset = offset;
2006  const int srcBufferOffset = 0;
2007  const int srcDevice = peerDevice;
2008 
2009  // double precision calculation
2010  double px, py, pz, wx, wy, wz, q;
2011  int part;
2012  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numAtoms; i += blockDim.x * gridDim.x) {
2013  // this gets atoms in the same order as the PmeCuda code path
2014  if (t_doHomePatches) {
2015  q = (double)(charges[srcBufferOffset + i]);
2016  px = d_pos_x[srcBufferOffset + i];
2017  py = d_pos_y[srcBufferOffset + i];
2018  pz = d_pos_z[srcBufferOffset + i];
2019  if (t_doFEP || t_doTI) {
2020  part = partition[srcBufferOffset + i];
2021  }
2022  } else {
2023  q = (double)(d_peer_charge[srcDevice][srcBufferOffset + i]);
2024  px = d_peer_pos_x[srcDevice][srcBufferOffset + i];
2025  py = d_peer_pos_y[srcDevice][srcBufferOffset + i];
2026  pz = d_peer_pos_z[srcDevice][srcBufferOffset + i];
2027  if (t_doFEP || t_doTI) {
2028  part = d_peer_partition[srcDevice][srcBufferOffset + i];
2029  }
2030  }
2031 
2032  double3 w_vec = lat.scale(Vector(px, py, pz));
2033  wx = w_vec.x;
2034  wy = w_vec.y;
2035  wz = w_vec.z;
2036 
2037  if (handleBoundary) {
2038  wx = (wx - (floor(wx + 0.5) - 0.5));
2039  wy = (wy - (floor(wy + 0.5) - 0.5));
2040  wz = (wz - (floor(wz + 0.5) - 0.5));
2041  }
2042 
2043  foo.x = (float) wx;
2044  foo.y = (float) wy;
2045  foo.z = (float) wz;
2046 
2047  if (handleBoundary) {
2048  foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2049  foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2050  foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2051  }
2052 
2053  foo.w = (float) (charge_scaling * q);
2054  if (!t_doFEP && !t_doTI) {
2055  s_atoms[pmeBufferOffset + i] = foo;
2056  }
2057  else { // alchemical multiple grids
2058  float4 foo_zero_charge = foo;
2059  foo_zero_charge.w = 0.0f;
2060  s_partition[pmeBufferOffset + i] = part;
2061  /* grid 0 grid 1 grid 2 grid 3 grid 4
2062  * non-alch (p = 0) Y Y N N Y
2063  * appearing (p = 1) Y N Y N N
2064  * disappearing (p = 2) N Y N Y N
2065  * Requirements of grids:
2066  * 1. t_doFEP || t_doTI : grid 0, grid 1
2067  * 2. t_doAlchDecouple : grid 2, grid 3
2068  * 3. t_doAlchSoftCore || t_doTI: grid 4
2069  * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2070  * Although the atoms with zero charges in the extra grids would not change in non-migration steps,
2071  * I still see no way to avoid this copying, because the positions of the atoms can change.
2072  * The non-zero charges may also change if they are computed by a QM engine or some new kind of force field.
2073  * It seems this branching in non-migration steps is inevitable.
2074  */
2075  switch (part) {
2076  // non-alch atoms
2077  case 0: {
2078  s_atoms[pmeBufferOffset + i] = foo;
2079  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2080  if (t_doAlchDecouple) {
2081  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2082  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2083  if (t_doAlchSoftCore || t_doTI) {
2084  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo;
2085  }
2086  } else {
2087  if (t_doAlchSoftCore || t_doTI) {
2088  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2089  }
2090  }
2091  break;
2092  }
2093  // appearing atoms
2094  case 1: {
2095  s_atoms[pmeBufferOffset + i] = foo;
2096  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo_zero_charge;
2097  if (t_doAlchDecouple) {
2098  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2099  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2100  if (t_doAlchSoftCore || t_doTI) {
2101  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2102  }
2103  } else {
2104  if (t_doAlchSoftCore || t_doTI) {
2105  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2106  }
2107  }
2108  break;
2109  }
2110  // disappearing atoms
2111  case 2: {
2112  s_atoms[pmeBufferOffset + i] = foo_zero_charge;
2113  s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2114  if (t_doAlchDecouple) {
2115  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2116  s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo;
2117  if (t_doAlchSoftCore || t_doTI) {
2118  s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2119  }
2120  } else {
2121  if (t_doAlchSoftCore || t_doTI) {
2122  s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2123  }
2124  }
2125  break;
2126  }
2127  }
2128  }
2129  }
2130 }
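// The boundary handling above maps a scaled (fractional) coordinate into [0, 1):
// w - (floor(w + 0.5) - 0.5) recenters w and shifts by one half, and the later
// "foo.x -= 1.0f*(foo.x >= 1.0f)" guards against the float conversion rounding up to
// exactly 1.0f. A minimal sketch of the same mapping (sketchWrapFractional is a
// hypothetical name):
__host__ __device__ inline float sketchWrapFractional(double w)
{
  const double wrapped = w - (floor(w + 0.5) - 0.5); // in [0, 1) in double precision
  float wf = (float) wrapped;
  wf -= 1.0f * (wf >= 1.0f);                         // pull a rounded-up 1.0f back into range
  return wf;
}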
2131 
2132 void SequencerCUDAKernel::set_compute_positions(
2133  const int devID,
2134  const bool isPmeDevice,
2135  const int nDev,
2136  const int numPatchesHomeAndProxy,
2137  const int numPatchesHome,
2138  const bool doNbond,
2139  const bool doSlow,
2140  const bool doFEP,
2141  const bool doTI,
2142  const bool doAlchDecouple,
2143  const bool doAlchSoftCore,
2144  const bool handleBoundary,
2145  const double* d_pos_x,
2146  const double* d_pos_y,
2147  const double* d_pos_z,
2148 #ifndef NAMD_NCCL_ALLREDUCE
2149  double** d_peer_pos_x,
2150  double** d_peer_pos_y,
2151  double** d_peer_pos_z,
2152  float** d_peer_charge,
2153  int** d_peer_partition,
2154 #endif
2155  const float* charges,
2156  const int* partition,
2157  const double charge_scaling,
2158  const double3* patchCenter,
2159  const int* s_patchPositions,
2160  const int* s_pencilPatchIndex,
2161  const int* s_patchIDs,
2162  const int* patchSortOrder,
2163  const Lattice lattice,
2164  float4* nb_atoms,
2165  float4* b_atoms,
2166  float4* s_atoms,
2167  int* s_partition,
2168  int numTotalAtoms,
2169  CudaLocalRecord* localRecords,
2170  CudaPeerRecord* peerRecords,
2171  std::vector<int>& atomCounts,
2172  cudaStream_t stream)
2173 {
2174  // Launch Local Set Compute Positions
2175  if(doNbond){
2176  setComputePositionsKernel<true, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2177  localRecords, peerRecords, patchCenter, patchSortOrder,
2178  numPatchesHome, devID, lattice, charge_scaling,
2179  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2180  nb_atoms, b_atoms, s_atoms
2181  );
2182  } else {
2183  setComputePositionsKernel<false, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2184  localRecords, peerRecords, patchCenter, patchSortOrder,
2185  numPatchesHome, devID, lattice, charge_scaling,
2186  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2187  nb_atoms, b_atoms, s_atoms
2188  );
2189  }
2190 
2191  // Launch Peer Set Computes
2192  if (nDev != 1) {
2193  const int numProxyPatches = numPatchesHomeAndProxy - numPatchesHome;
2194  if(doNbond){
2195  setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2196  localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2197  numProxyPatches, devID, lattice, charge_scaling,
2198  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2199  nb_atoms, b_atoms, s_atoms
2200  );
2201  } else {
2202  setComputePositionsKernel<false, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2203  localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2204  numProxyPatches, devID, lattice, charge_scaling,
2205  d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2206  nb_atoms, b_atoms, s_atoms
2207  );
2208  }
2209  }
2210 
2211  // Launch PME setComputes
2212  #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
2213  setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
2214  <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
2215  d_pos_x, d_pos_y, d_pos_z, charges, \
2216  d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
2217  d_peer_partition, partition, charge_scaling, \
2218  s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
2219  lattice, s_atoms, numTotalAtoms, s_partition, \
2220  i, numAtoms, offset, handleBoundary \
2221  );
2222  // Only when PME long-range electrostatics is enabled (doSlow is true).
2223  // The partition is needed for alchemical calculations.
2224  if(doSlow && isPmeDevice) {
2225  int offset = 0;
2226  for (int i = 0; i < nDev; i++) {
2227  const bool home = (i == devID);
2228  const int numAtoms = atomCounts[i];
2229  const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
2230  const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
2231 
2232  switch (options) {
2233  case 0: CALL(0, 0, 0, 0, 0); break;
2234  case 1: CALL(0, 0, 0, 0, 1); break;
2235  case 2: CALL(0, 0, 0, 1, 0); break;
2236  case 3: CALL(0, 0, 0, 1, 1); break;
2237  case 4: CALL(0, 0, 1, 0, 0); break;
2238  case 5: CALL(0, 0, 1, 0, 1); break;
2239  case 6: CALL(0, 0, 1, 1, 0); break;
2240  case 7: CALL(0, 0, 1, 1, 1); break;
2241  case 8: CALL(0, 1, 0, 0, 0); break;
2242  case 9: CALL(0, 1, 0, 0, 1); break;
2243  case 10: CALL(0, 1, 0, 1, 0); break;
2244  case 11: CALL(0, 1, 0, 1, 1); break;
2245  case 12: CALL(0, 1, 1, 0, 0); break;
2246  case 13: CALL(0, 1, 1, 0, 1); break;
2247  case 14: CALL(0, 1, 1, 1, 0); break;
2248  case 15: CALL(0, 1, 1, 1, 1); break;
2249  case 16: CALL(1, 0, 0, 0, 0); break;
2250  case 17: CALL(1, 0, 0, 0, 1); break;
2251  case 18: CALL(1, 0, 0, 1, 0); break;
2252  case 19: CALL(1, 0, 0, 1, 1); break;
2253  case 20: CALL(1, 0, 1, 0, 0); break;
2254  case 21: CALL(1, 0, 1, 0, 1); break;
2255  case 22: CALL(1, 0, 1, 1, 0); break;
2256  case 23: CALL(1, 0, 1, 1, 1); break;
2257  case 24: CALL(1, 1, 0, 0, 0); break;
2258  case 25: CALL(1, 1, 0, 0, 1); break;
2259  case 26: CALL(1, 1, 0, 1, 0); break;
2260  case 27: CALL(1, 1, 0, 1, 1); break;
2261  case 28: CALL(1, 1, 1, 0, 0); break;
2262  case 29: CALL(1, 1, 1, 0, 1); break;
2263  case 30: CALL(1, 1, 1, 1, 0); break;
2264  case 31: CALL(1, 1, 1, 1, 1); break;
2265  default:
2266  NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
2267  }
2268 
2269  offset += numAtoms;
2270  }
2271  }
2272  #undef CALL
2273 }
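// The switch above dispatches to one of 32 compile-time instantiations selected by a
// 5-bit flag word. A minimal sketch of the packing (sketchPackPmePositionOptions is a
// hypothetical name, not part of the NAMD API):
static inline int sketchPackPmePositionOptions(
    bool home, bool doFEP, bool doTI, bool doAlchDecouple, bool doAlchSoftCore)
{
  return (home << 4) | (doFEP << 3) | (doTI << 2) | (doAlchDecouple << 1) | (int) doAlchSoftCore;
}
// e.g. sketchPackPmePositionOptions(true, true, false, false, true) == 25, which maps to
// CALL(1, 1, 0, 0, 1) above.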
2274 
2275 template <bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
2276 __global__ void setPmePositionsKernel(
2277  double charge_scaling,
2278  const Lattice lattice,
2279  const double *pos_x,
2280  const double *pos_y,
2281  const double *pos_z,
2282  const float *charges,
2283  const int* partition,
2284  float4 *s_atoms,
2285  int* s_atoms_partition,
2286  int numAtoms)
2287 {
2288  int i = threadIdx.x + blockIdx.x*blockDim.x;
2289  if (i < numAtoms) {
2290  Lattice lat = lattice;
2291  float4 foo;
2292  double wx, wy, wz, q;
2293  q = (double)(charges[i]);
2294 
2295  double3 w_vec = lat.scale(Vector(pos_x[i], pos_y[i], pos_z[i]));
2296 
2297  wx = w_vec.x;
2298  wy = w_vec.y;
2299  wz = w_vec.z;
2300  wx = (wx - (floor(wx + 0.5) - 0.5));
2301  wy = (wy - (floor(wy + 0.5) - 0.5));
2302  wz = (wz - (floor(wz + 0.5) - 0.5));
2303  foo.x = (float) wx;
2304  foo.y = (float) wy;
2305  foo.z = (float) wz;
2306  foo.w = (float) (charge_scaling * q);
2307  foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2308  foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2309  foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2310  if (!t_doFEP && !t_doTI) {
2311  s_atoms[i] = foo;
2312  }
2313  else { // alchemical multiple grids
2314  float4 foo_zero_charge = foo;
2315  foo_zero_charge.w = 0.0f;
2316  s_atoms_partition[i] = partition[i];
2317  /* grid 0 grid 1 grid 2 grid 3 grid 4
2318  * non-alch (p = 0) Y Y N N Y
2319  * appearing (p = 1) Y N Y N N
2320  * disappearing (p = 2) N Y N Y N
2321  * Requirements of grids:
2322  * 1. t_doFEP || t_doTI : grid 0, grid 1
2323  * 2. t_doAlchDecouple : grid 2, grid 3
2324  * 3. t_doAlchSoftCore || t_doTI: grid 4
2325  * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2326  */
2327  switch (partition[i]) {
2328  // non-alch atoms
2329  case 0: {
2330  s_atoms[i] = foo;
2331  s_atoms[i + numAtoms] = foo;
2332  if (t_doAlchDecouple) {
2333  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2334  s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2335  if (t_doAlchSoftCore || t_doTI) {
2336  s_atoms[i + 4 * numAtoms] = foo;
2337  }
2338  } else {
2339  if (t_doAlchSoftCore || t_doTI) {
2340  s_atoms[i + 2 * numAtoms] = foo;
2341  }
2342  }
2343  break;
2344  }
2345  // appearing atoms
2346  case 1: {
2347  s_atoms[i] = foo;
2348  s_atoms[i + numAtoms] = foo_zero_charge;
2349  if (t_doAlchDecouple) {
2350  s_atoms[i + 2 * numAtoms] = foo;
2351  s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2352  if (t_doAlchSoftCore || t_doTI) {
2353  s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2354  }
2355  } else {
2356  if (t_doAlchSoftCore || t_doTI) {
2357  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2358  }
2359  }
2360  break;
2361  }
2362  // disappearing atoms
2363  case 2: {
2364  s_atoms[i] = foo_zero_charge;
2365  s_atoms[i + numAtoms] = foo;
2366  if (t_doAlchDecouple) {
2367  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2368  s_atoms[i + 3 * numAtoms] = foo;
2369  if (t_doAlchSoftCore || t_doTI) {
2370  s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2371  }
2372  } else {
2373  if (t_doAlchSoftCore || t_doTI) {
2374  s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2375  }
2376  }
2377  break;
2378  }
2379  }
2380  }
2381  }
2382 }
2383 
2384 __global__ void maximumMoveKernel(const double maxvel2,
2385  const double * __restrict vel_x,
2386  const double * __restrict vel_y,
2387  const double * __restrict vel_z,
2388  int *killme,
2389  const int numAtoms)
2390 {
2391  int i = threadIdx.x + blockIdx.x*blockDim.x;
2392  if (i < numAtoms) {
2393  double vel2 = vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i];
2394  if (vel2 > maxvel2) {
2395  //If this ever happens, we're already screwed, so performance does not matter
2396  atomicAdd(killme, 1);
2397  }
2398  }
2399 }
2400 
2401 void SequencerCUDAKernel::maximumMove(const double maxvel2,
2402  const double *vel_x,
2403  const double *vel_y,
2404  const double *vel_z,
2405  int *killme,
2406  const int numAtoms,
2407  cudaStream_t stream)
2408 {
2409  //cudaCheck(cudaMemset(killme, 0, sizeof(int)));
2410  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2411  maximumMoveKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
2412  maxvel2, vel_x, vel_y, vel_z, killme, numAtoms);
2413 }
2414 
2415 void SequencerCUDAKernel::addForceToMomentum(
2416  bool doFixedAtoms,
2417  const double scaling,
2418  double dt_normal,
2419  double dt_nbond,
2420  double dt_slow,
2421  double velrescaling, // for stochastic velocity rescaling
2422  const double *recipMass,
2423  const double *f_normal_x,
2424  const double *f_normal_y,
2425  const double *f_normal_z,
2426  const double *f_nbond_x,
2427  const double *f_nbond_y,
2428  const double *f_nbond_z,
2429  const double *f_slow_x,
2430  const double *f_slow_y,
2431  const double *f_slow_z,
2432  double *vel_x,
2433  double *vel_y,
2434  double *vel_z,
2435  const int *atomFixed,
2436  int numAtoms,
2437  int maxForceNumber,
2438  cudaStream_t stream)
2439 {
2440  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2441  const bool doVelRescaling = (velrescaling != 1.0);
2442  int kernelCalled = 0;
2443 #define CALL(MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED) \
2444  do { \
2445  addForceToMomentumKernel<MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED> \
2446  <<<grid, ATOM_BLOCKS, 0, stream>>>( \
2447  scaling, dt_normal, dt_nbond, dt_slow, velrescaling, \
2448  recipMass, f_normal_x, f_normal_y, f_normal_z, \
2449  f_nbond_x, f_nbond_y, f_nbond_z, \
2450  f_slow_x, f_slow_y, f_slow_z, \
2451  vel_x, vel_y, vel_z, atomFixed, \
2452  numAtoms ); \
2453  kernelCalled++; \
2454  } while (0)
2455  switch (maxForceNumber) {
2456  case 2: {
2457  if (doVelRescaling && doFixedAtoms) CALL(2, true, true);
2458  if (doVelRescaling && !doFixedAtoms) CALL(2, true, false);
2459  if (!doVelRescaling && !doFixedAtoms) CALL(2, false, false);
2460  if (!doVelRescaling && doFixedAtoms) CALL(2, false, true);
2461  break;
2462  }
2463  case 1: {
2464  if (doVelRescaling && doFixedAtoms) CALL(1, true, true);
2465  if (doVelRescaling && !doFixedAtoms) CALL(1, true, false);
2466  if (!doVelRescaling && !doFixedAtoms) CALL(1, false, false);
2467  if (!doVelRescaling && doFixedAtoms) CALL(1, false, true);
2468  break;
2469  }
2470  case 0: {
2471  if (doVelRescaling && doFixedAtoms) CALL(0, true, true);
2472  if (doVelRescaling && !doFixedAtoms) CALL(0, true, false);
2473  if (!doVelRescaling && !doFixedAtoms) CALL(0, false, false);
2474  if (!doVelRescaling && doFixedAtoms) CALL(0, false, true);
2475  break;
2476  }
2477  }
2478 #undef CALL
2479  if (kernelCalled != 1) NAMD_bug("Incorrect number of calls to kernel in SequencerCUDAKernel::addForceToMomentum!\n");
2480  // cudaCheck(cudaGetLastError());
2481 }
2482 
2483 template <bool copyPos, bool DO_FIXED>
2484 __global__ void addVelocityToPositionKernel(
2485  const double dt,
2486  const int* __restrict atomFixed,
2487  const double * __restrict vel_x,
2488  const double * __restrict vel_y,
2489  const double * __restrict vel_z,
2490  double * __restrict pos_x,
2491  double * __restrict pos_y,
2492  double * __restrict pos_z,
2493  double * __restrict h_pos_x, // host-mapped vectors
2494  double * __restrict h_pos_y,
2495  double * __restrict h_pos_z,
2496  int numAtoms)
2497 {
2498  int i = threadIdx.x + blockIdx.x*blockDim.x;
2499  double x,y,z;
2500  if (i < numAtoms) {
2501  x = pos_x[i];
2502  y = pos_y[i];
2503  z = pos_z[i];
2504  if (DO_FIXED) {
2505  if (!atomFixed[i]) {
2506  x += vel_x[i] * dt;
2507  y += vel_y[i] * dt;
2508  z += vel_z[i] * dt;
2509  }
2510  } else {
2511  x += vel_x[i] * dt;
2512  y += vel_y[i] * dt;
2513  z += vel_z[i] * dt;
2514  }
2515  pos_x[i] = x;
2516  pos_y[i] = y;
2517  pos_z[i] = z;
2518  if(copyPos){
2519  h_pos_x[i] = x;
2520  h_pos_y[i] = y;
2521  h_pos_z[i] = z;
2522  }
2523  }
2524 }
2525 
2526 void SequencerCUDAKernel::addVelocityToPosition(
2527  bool doFixedAtoms,
2528  const double dt,
2529  const int *atomFixed,
2530  const double *vel_x,
2531  const double *vel_y,
2532  const double *vel_z,
2533  double *pos_x,
2534  double *pos_y,
2535  double *pos_z,
2536  double *h_pos_x,
2537  double *h_pos_y,
2538  double *h_pos_z,
2539  int numAtoms,
2540  bool copyPositions,
2541  cudaStream_t stream)
2542 {
2543  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2544 #define CALL(copyPos, DO_FIXED) \
2545  addVelocityToPositionKernel<copyPos, DO_FIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
2546  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z, \
2547  h_pos_x, h_pos_y, h_pos_z, numAtoms)
2548  if ( copyPositions && doFixedAtoms) CALL(true, true);
2549  else if (!copyPositions && doFixedAtoms) CALL(false, true);
2550  else if ( copyPositions && !doFixedAtoms) CALL(true, false);
2551  else if (!copyPositions && !doFixedAtoms) CALL(false, false);
2552  else NAMD_bug("No kernel was called in SequencerCUDAKernel::addVelocityToPosition!\n");
2553 #undef CALL
2554  //cudaCheck(cudaGetLastError());
2555 }
2556 
2557 template <bool doFixed>
2558 __global__ void updateRigidArraysKernel(
2559  const double dt,
2560  const int * __restrict atomFixed,
2561  const double * __restrict vel_x,
2562  const double * __restrict vel_y,
2563  const double * __restrict vel_z,
2564  const double * __restrict pos_x,
2565  const double * __restrict pos_y,
2566  const double * __restrict pos_z,
2567  double * __restrict velNew_x,
2568  double * __restrict velNew_y,
2569  double * __restrict velNew_z,
2570  double * __restrict posNew_x,
2571  double * __restrict posNew_y,
2572  double * __restrict posNew_z,
2573  int numAtoms)
2574 {
2575  int i = threadIdx.x + blockIdx.x*blockDim.x;
2576  if (i < numAtoms) {
2577  double vx = vel_x[i];
2578  double vy = vel_y[i];
2579  double vz = vel_z[i];
2580  if (!doFixed) {
2581  posNew_x[i] = pos_x[i] + (vx * dt);
2582  posNew_y[i] = pos_y[i] + (vy * dt);
2583  posNew_z[i] = pos_z[i] + (vz * dt);
2584  } else {
2585  if (!atomFixed[i]) {
2586  posNew_x[i] = pos_x[i] + (vx * dt);
2587  posNew_y[i] = pos_y[i] + (vy * dt);
2588  posNew_z[i] = pos_z[i] + (vz * dt);
2589  } else {
2590  posNew_x[i] = pos_x[i];
2591  posNew_y[i] = pos_y[i];
2592  posNew_z[i] = pos_z[i];
2593  }
2594  }
2595  velNew_x[i] = vx;
2596  velNew_y[i] = vy;
2597  velNew_z[i] = vz;
2598  }
2599 }
2600 
2601 
2602 // JM NOTE: Optimize this kernel further to improve global memory access pattern
2603 __global__ void centerOfMassKernel(
2604  const double * __restrict coor_x,
2605  const double * __restrict coor_y,
2606  const double * __restrict coor_z,
2607  double * __restrict cm_x, // center of mass is atom-sized
2608  double * __restrict cm_y,
2609  double * __restrict cm_z,
2610  const float * __restrict mass,
2611  const int * __restrict hydrogenGroupSize,
2612  const int numAtoms)
2613 {
2614  int i = threadIdx.x + blockIdx.x*blockDim.x;
2615  if (i < numAtoms) {
2616  int hgs = hydrogenGroupSize[i];
2617  int j;
2618  if (hgs > 0) {
2619  // missing fixed atoms
2620  BigReal m_cm = 0;
2621  BigReal reg_cm_x = 0;
2622  BigReal reg_cm_y = 0;
2623  BigReal reg_cm_z = 0;
2624  for ( j = i; j < (i+hgs); ++j ) {
2625  m_cm += mass[j];
2626  reg_cm_x += mass[j] * coor_x[j];
2627  reg_cm_y += mass[j] * coor_y[j];
2628  reg_cm_z += mass[j] * coor_z[j];
2629  }
2630  BigReal inv_m_cm = 1.0/m_cm;
2631  reg_cm_x *= inv_m_cm;
2632  reg_cm_y *= inv_m_cm;
2633  reg_cm_z *= inv_m_cm;
2634 
2635  for(j = i ; j < (i + hgs); j++){
2636  cm_x[j] = reg_cm_x;
2637  cm_y[j] = reg_cm_y;
2638  cm_z[j] = reg_cm_z;
2639  }
2640  }
2641  }
2642 }
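// hydrogenGroupSize is expected to be nonzero only on the parent atom of each hydrogen
// group, so the loops above run once per group and broadcast the result to all members.
// A minimal scalar sketch of the mass-weighted average computed per component
// (sketchGroupCenterOfMass is a hypothetical name):
__host__ __device__ inline double sketchGroupCenterOfMass(
    const double* coor, const float* mass, int first, int groupSize)
{
  double m_total = 0.0, weighted = 0.0;
  for (int j = first; j < first + groupSize; ++j) {
    m_total  += mass[j];
    weighted += mass[j] * coor[j];
  }
  return weighted / m_total;
}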
2643 
2644 void SequencerCUDAKernel::centerOfMass(
2645  const double *coor_x,
2646  const double *coor_y,
2647  const double *coor_z,
2648  double *cm_x,
2649  double *cm_y,
2650  double *cm_z,
2651  const float* mass,
2652  const int* hydrogenGroupSize,
2653  const int numAtoms,
2654  cudaStream_t stream
2655 ){
2656  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2657  //Calls kernels
2658  centerOfMassKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(coor_x, coor_y, coor_z,
2659  cm_x, cm_y, cm_z, mass, hydrogenGroupSize, numAtoms);
2660 }
2661 
2662 
2663 void SequencerCUDAKernel::updateRigidArrays(
2664  const bool doFixed,
2665  const double dt,
2666  const int *atomFixed,
2667  const double *vel_x,
2668  const double *vel_y,
2669  const double *vel_z,
2670  const double *pos_x,
2671  const double *pos_y,
2672  const double *pos_z,
2673  double *velNew_x,
2674  double *velNew_y,
2675  double *velNew_z,
2676  double *posNew_x,
2677  double *posNew_y,
2678  double *posNew_z,
2679  int numAtoms,
2680  cudaStream_t stream)
2681 {
2682  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2683  if (doFixed)
2684  updateRigidArraysKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2685  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2686  velNew_x, velNew_y, velNew_z,
2687  posNew_x, posNew_y, posNew_z, numAtoms);
2688  else
2689  updateRigidArraysKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2690  dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2691  velNew_x, velNew_y, velNew_z,
2692  posNew_x, posNew_y, posNew_z, numAtoms);
2693  // cudaCheck(cudaGetLastError());
2694 }
2695 
2696 template<int BLOCK_SIZE, bool DO_FIXED>
2697 __global__ void submitHalfKernel(
2698  // const int* __restrict atomFixed,
2699  const double * __restrict vel_x,
2700  const double * __restrict vel_y,
2701  const double * __restrict vel_z,
2702  const double * __restrict vcm_x,
2703  const double * __restrict vcm_y,
2704  const double * __restrict vcm_z,
2705  const float * __restrict mass,
2706  BigReal *kineticEnergy,
2707  BigReal *intKineticEnergy,
2708  cudaTensor *virial,
2709  cudaTensor *intVirialNormal,
2710  BigReal *h_kineticEnergy,
2711  BigReal *h_intKineticEnergy,
2712  cudaTensor *h_virial,
2713  cudaTensor *h_intVirialNormal,
2714  int *hydrogenGroupSize,
2715  int numAtoms,
2716  unsigned int* tbcatomic)
2717 {
2718  BigReal k = 0, intK = 0;
2719  cudaTensor v, intV;
2720  zero_cudaTensor(v);
2721  zero_cudaTensor(intV);
2722  int i = threadIdx.x + blockIdx.x*blockDim.x;
2723  int totaltb = gridDim.x;
2724  __shared__ bool isLastBlockDone;
2725 
2726 
2727  if(threadIdx.x == 0){
2728  isLastBlockDone = 0;
2729  }
2730 
2731  __syncthreads();
2732 
2733  if (i < numAtoms) {
2734  k = mass[i] *
2735  (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
2736  v.xx = mass[i] * vel_x[i] * vel_x[i];
2737  if (DO_FIXED) {
2738  v.xy = mass[i] * vel_x[i] * vel_y[i];
2739  v.xz = mass[i] * vel_x[i] * vel_z[i];
2740  }
2741  v.yx = mass[i] * vel_y[i] * vel_x[i];
2742  v.yy = mass[i] * vel_y[i] * vel_y[i];
2743  if (DO_FIXED) {
2744  v.yz = mass[i] * vel_y[i] * vel_z[i];
2745  }
2746  v.zx = mass[i] * vel_z[i] * vel_x[i];
2747  v.zy = mass[i] * vel_z[i] * vel_y[i];
2748  v.zz = mass[i] * vel_z[i] * vel_z[i];
2749 
2750 #if 0
2751  int hgs = hydrogenGroupSize[i];
2752  if (hgs > 0) {
2753  BigReal m_cm = 0;
2754  BigReal v_cm_x = 0;
2755  BigReal v_cm_y = 0;
2756  BigReal v_cm_z = 0;
2757  for (int j = i; j < (i+hgs); j++) {
2758  m_cm += mass[j];
2759  v_cm_x += mass[j] * vel_x[j];
2760  v_cm_y += mass[j] * vel_y[j];
2761  v_cm_z += mass[j] * vel_z[j];
2762  }
2763  BigReal recip_m_cm = 1.0 / m_cm;
2764  v_cm_x *= recip_m_cm;
2765  v_cm_y *= recip_m_cm;
2766  v_cm_z *= recip_m_cm;
2767 
2768  for (int j = i; j < (i+hgs); j++) {
2769  BigReal dv_x = vel_x[j] - v_cm_x;
2770  BigReal dv_y = vel_y[j] - v_cm_y;
2771  BigReal dv_z = vel_z[j] - v_cm_z;
2772  intK += mass[j] *
2773  (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
2774  intV.xx += mass[j] * vel_x[j] * dv_x;
2775  intV.xy += mass[j] * vel_x[j] * dv_y;
2776  intV.xz += mass[j] * vel_x[j] * dv_z;
2777  intV.yx += mass[j] * vel_y[j] * dv_x;
2778  intV.yy += mass[j] * vel_y[j] * dv_y;
2779  intV.yz += mass[j] * vel_y[j] * dv_z;
2780  intV.zx += mass[j] * vel_z[j] * dv_x;
2781  intV.zy += mass[j] * vel_z[j] * dv_y;
2782  intV.zz += mass[j] * vel_z[j] * dv_z;
2783  }
2784  }
2785 #else
2786  // JM: New version with centers of mass calculated by a separate kernel
2787  BigReal v_cm_x = vcm_x[i];
2788  BigReal v_cm_y = vcm_y[i];
2789  BigReal v_cm_z = vcm_z[i];
2790  BigReal dv_x = vel_x[i] - v_cm_x;
2791  BigReal dv_y = vel_y[i] - v_cm_y;
2792  BigReal dv_z = vel_z[i] - v_cm_z;
2793  intK += mass[i] *
2794  (vel_x[i] * dv_x + vel_y[i] * dv_y + vel_z[i] * dv_z);
2795  intV.xx += mass[i] * vel_x[i] * dv_x;
2796  intV.xy += mass[i] * vel_x[i] * dv_y;
2797  intV.xz += mass[i] * vel_x[i] * dv_z;
2798  intV.yx += mass[i] * vel_y[i] * dv_x;
2799  intV.yy += mass[i] * vel_y[i] * dv_y;
2800  intV.yz += mass[i] * vel_y[i] * dv_z;
2801  intV.zx += mass[i] * vel_z[i] * dv_x;
2802  intV.zy += mass[i] * vel_z[i] * dv_y;
2803  intV.zz += mass[i] * vel_z[i] * dv_z;
2804 #endif
2805  }
2806 
2807  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
2808  __shared__ typename BlockReduce::TempStorage temp_storage;
2809  k = BlockReduce(temp_storage).Sum(k);
2810  __syncthreads();
2811  v.xx = BlockReduce(temp_storage).Sum(v.xx);
2812  __syncthreads();
2813  if (DO_FIXED) {
2814  v.xy = BlockReduce(temp_storage).Sum(v.xy);
2815  __syncthreads();
2816  v.xz = BlockReduce(temp_storage).Sum(v.xz);
2817  __syncthreads();
2818  }
2819  v.yx = BlockReduce(temp_storage).Sum(v.yx);
2820  __syncthreads();
2821  v.yy = BlockReduce(temp_storage).Sum(v.yy);
2822  __syncthreads();
2823  if (DO_FIXED) {
2824  v.yz = BlockReduce(temp_storage).Sum(v.yz);
2825  __syncthreads();
2826  }
2827  v.zx = BlockReduce(temp_storage).Sum(v.zx);
2828  __syncthreads();
2829  v.zy = BlockReduce(temp_storage).Sum(v.zy);
2830  __syncthreads();
2831  v.zz = BlockReduce(temp_storage).Sum(v.zz);
2832  __syncthreads();
2833  intK = BlockReduce(temp_storage).Sum(intK);
2834  __syncthreads();
2835  intV.xx = BlockReduce(temp_storage).Sum(intV.xx);
2836  __syncthreads();
2837  intV.xy = BlockReduce(temp_storage).Sum(intV.xy);
2838  __syncthreads();
2839  intV.xz = BlockReduce(temp_storage).Sum(intV.xz);
2840  __syncthreads();
2841  intV.yx = BlockReduce(temp_storage).Sum(intV.yx);
2842  __syncthreads();
2843  intV.yy = BlockReduce(temp_storage).Sum(intV.yy);
2844  __syncthreads();
2845  intV.yz = BlockReduce(temp_storage).Sum(intV.yz);
2846  __syncthreads();
2847  intV.zx = BlockReduce(temp_storage).Sum(intV.zx);
2848  __syncthreads();
2849  intV.zy = BlockReduce(temp_storage).Sum(intV.zy);
2850  __syncthreads();
2851  intV.zz = BlockReduce(temp_storage).Sum(intV.zz);
2852  __syncthreads();
2853 
2854  if (threadIdx.x == 0) {
2855  const int bin = blockIdx.x % ATOMIC_BINS;
2856 
2857  atomicAdd(&kineticEnergy[bin], k);
2858  atomicAdd(&virial[bin].xx, v.xx);
2859  if (DO_FIXED) {
2860  atomicAdd(&virial[bin].xy, v.xy);
2861  atomicAdd(&virial[bin].xz, v.xz);
2862  }
2863  atomicAdd(&virial[bin].yx, v.yx);
2864  atomicAdd(&virial[bin].yy, v.yy);
2865  if (DO_FIXED) {
2866  atomicAdd(&virial[bin].yz, v.yz);
2867  }
2868  atomicAdd(&virial[bin].zx, v.zx);
2869  atomicAdd(&virial[bin].zy, v.zy);
2870  atomicAdd(&virial[bin].zz, v.zz);
2871  atomicAdd(&intKineticEnergy[bin], intK);
2872  atomicAdd(&intVirialNormal[bin].xx, intV.xx);
2873  atomicAdd(&intVirialNormal[bin].xy, intV.xy);
2874  atomicAdd(&intVirialNormal[bin].xz, intV.xz);
2875  atomicAdd(&intVirialNormal[bin].yx, intV.yx);
2876  atomicAdd(&intVirialNormal[bin].yy, intV.yy);
2877  atomicAdd(&intVirialNormal[bin].yz, intV.yz);
2878  atomicAdd(&intVirialNormal[bin].zx, intV.zx);
2879  atomicAdd(&intVirialNormal[bin].zy, intV.zy);
2880  atomicAdd(&intVirialNormal[bin].zz, intV.zz);
2881 
2882  __threadfence();
2883  unsigned int value = atomicInc(&tbcatomic[1], totaltb);
2884  isLastBlockDone = (value == (totaltb -1));
2885  }
2886  __syncthreads();
2887 
2888 #if 1
2889  if(isLastBlockDone){
2890  if(threadIdx.x < ATOMIC_BINS){
2891  const int bin = threadIdx.x;
2892 
2893  double k = kineticEnergy[bin];
2894  double intK = intKineticEnergy[bin];
2895  cudaTensor v = virial[bin];
2896  cudaTensor intV = intVirialNormal[bin];
2897 
2898  // sets device scalars back to zero
2899  kineticEnergy[bin] = 0.0;
2900  intKineticEnergy[bin] = 0.0;
2901  zero_cudaTensor(virial[bin]);
2902  zero_cudaTensor(intVirialNormal[bin]);
2903 
2904  if(ATOMIC_BINS > 1){
2905  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2906  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
2907  __shared__ typename WarpReduce::TempStorage tempStorage;
2908  __shared__ typename WarpReduceT::TempStorage tempStorageT;
2909 
2910  k = WarpReduce(tempStorage).Sum(k);
2911  intK = WarpReduce(tempStorage).Sum(intK);
2912  v = WarpReduceT(tempStorageT).Sum(v);
2913  intV = WarpReduceT(tempStorageT).Sum(intV);
2914  }
2915 
2916  if(threadIdx.x == 0){
2917  h_kineticEnergy[0] = k;
2918  h_intKineticEnergy[0] = intK;
2919  h_virial[0] = v;
2920  h_intVirialNormal[0] = intV;
2921 
2922  //resets atomic counter
2923  reset_atomic_counter(&tbcatomic[1]);
2924  }
2925  }
2926  }
2927 #endif
2928 }
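// The finalization above uses the "last block done" pattern: every block folds its
// partial sums into one of ATOMIC_BINS device-resident bins, then atomically bumps a
// counter; the single block that observes the final count reduces the bins and writes
// the host-mapped results. A minimal sketch of the counter step (illustrative only;
// sketchIsLastBlock is a hypothetical name):
__device__ inline bool sketchIsLastBlock(unsigned int* counter, unsigned int totalBlocks)
{
  __threadfence();                                         // publish this block's partial sums first
  unsigned int previous = atomicInc(counter, totalBlocks); // returns the pre-increment value
  return (previous == totalBlocks - 1);                    // true for exactly one block per reduction
}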
2929 
2930 void SequencerCUDAKernel::submitHalf(
2931  const bool doFixedAtoms,
2932  const double *vel_x,
2933  const double *vel_y,
2934  const double *vel_z,
2935  const double *vcm_x,
2936  const double *vcm_y,
2937  const double *vcm_z,
2938  const float *mass,
2939  BigReal *kineticEnergy,
2940  BigReal *intKineticEnergy,
2941  cudaTensor *virial,
2942  cudaTensor *intVirialNormal,
2943  BigReal *h_kineticEnergy,
2944  BigReal *h_intKineticEnergy,
2945  cudaTensor *h_virial,
2946  cudaTensor *h_intVirialNormal,
2947  int *hydrogenGroupSize,
2948  int numAtoms,
2949  unsigned int* tbcatomic,
2950  cudaStream_t stream)
2951 {
2952  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2953  if (doFixedAtoms)
2954  submitHalfKernel<ATOM_BLOCKS, true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2955  vel_x, vel_y, vel_z,
2956  vcm_x, vcm_y, vcm_z, mass,
2957  kineticEnergy, intKineticEnergy,
2958  virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2959  h_virial, h_intVirialNormal,
2960  hydrogenGroupSize, numAtoms, tbcatomic);
2961  else
2962  submitHalfKernel<ATOM_BLOCKS, false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2963  vel_x, vel_y, vel_z,
2964  vcm_x, vcm_y, vcm_z, mass,
2965  kineticEnergy, intKineticEnergy,
2966  virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2967  h_virial, h_intVirialNormal,
2968  hydrogenGroupSize, numAtoms, tbcatomic);
2969  //cudaCheck(cudaGetLastError());
2970 }
2971 
2972 __global__ void scaleCoordinateUseGroupPressureKernel(
2973  double * __restrict pos_x,
2974  double * __restrict pos_y,
2975  double * __restrict pos_z,
2976  float *mass,
2977  int *hydrogenGroupSize,
2978  cudaTensor factor,
2979  cudaVector origin,
2980  int numAtoms)
2981 {
2982  int i = threadIdx.x + blockIdx.x * blockDim.x;
2983  if (i < numAtoms) {
2984  int hgs = hydrogenGroupSize[i];
2985  int j;
2986  if (hgs > 0) {
2987  // missing fixed atoms implementation
2988  BigReal m_cm = 0;
2989  BigReal r_cm_x = 0;
2990  BigReal r_cm_y = 0;
2991  BigReal r_cm_z = 0;
2992  // calculate the center of mass
2993  for ( j = i; j < (i+hgs); ++j ) {
2994  m_cm += mass[j];
2995  r_cm_x += mass[j] * pos_x[j];
2996  r_cm_y += mass[j] * pos_y[j];
2997  r_cm_z += mass[j] * pos_z[j];
2998  }
2999  BigReal inv_m_cm = 1.0/m_cm;
3000  r_cm_x *= inv_m_cm;
3001  r_cm_y *= inv_m_cm;
3002  r_cm_z *= inv_m_cm;
3003  // scale the center of mass with factor
3004  // shift to origin
3005  double tx = r_cm_x - origin.x;
3006  double ty = r_cm_y - origin.y;
3007  double tz = r_cm_z - origin.z;
3008  // apply transformation
3009  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3010  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3011  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3012  // shift back
3013  new_r_cm_x += origin.x;
3014  new_r_cm_y += origin.y;
3015  new_r_cm_z += origin.z;
3016  // translation vector from old COM and new COM
3017  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3018  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3019  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3020  // shift the hydrogen group with translation vector
3021  for (j = i; j < (i+hgs); j++) {
3022  pos_x[j] += delta_r_cm_x;
3023  pos_y[j] += delta_r_cm_y;
3024  pos_z[j] += delta_r_cm_z;
3025  }
3026  }
3027  }
3028 }
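// The per-group scaling above applies an affine map to the group's center of mass,
// r' = factor * (r - origin) + origin, and then rigidly translates every member of the
// group by r' - r, so intra-group geometry is preserved. A minimal sketch of the map
// (sketchScaleAboutOrigin is a hypothetical name), using the same tensor convention:
__host__ __device__ inline double3 sketchScaleAboutOrigin(
    const double3 r, const cudaTensor factor, const cudaVector origin)
{
  const double tx = r.x - origin.x, ty = r.y - origin.y, tz = r.z - origin.z;
  double3 out;
  out.x = factor.xx*tx + factor.xy*ty + factor.xz*tz + origin.x;
  out.y = factor.yx*tx + factor.yy*ty + factor.yz*tz + origin.y;
  out.z = factor.zx*tx + factor.zy*ty + factor.zz*tz + origin.z;
  return out;
}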
3029 
3030 __global__ void scaleCoordinateNoGroupPressureKernel(
3031  double * __restrict pos_x,
3032  double * __restrict pos_y,
3033  double * __restrict pos_z,
3034  cudaTensor factor,
3035  cudaVector origin,
3036  int numAtoms)
3037 {
3038  int i = threadIdx.x + blockIdx.x * blockDim.x;
3039  if (i < numAtoms) {
3040  // missing fixed atoms implementation
3041  // shift to origin
3042  double tx = pos_x[i] - origin.x;
3043  double ty = pos_y[i] - origin.y;
3044  double tz = pos_z[i] - origin.z;
3045  // apply transformation
3046  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3047  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3048  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3049  // shift back
3050  pos_x[i] = ftx + origin.x;
3051  pos_y[i] = fty + origin.y;
3052  pos_z[i] = ftz + origin.z;
3053  }
3054 }
3055 
3056 __global__ void SetAtomIndexOrderKernel(
3057  int *id,
3058  int *idOrder,
3059  int numAtoms)
3060 {
3061  int i = threadIdx.x + blockIdx.x * blockDim.x;
3062  if (i < numAtoms) {
3063  int atomIndex = id[i];
3064  idOrder[atomIndex] = i;
3065  }
3066 }
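// SetAtomIndexOrderKernel builds the inverse permutation of the atom id array,
// idOrder[id[i]] = i, so a global atom index can be mapped back to its local SOA slot.
// A minimal host-side sketch of the same operation (sketchBuildInverseOrder is a
// hypothetical name):
static inline void sketchBuildInverseOrder(const int* id, int* idOrder, int numAtoms)
{
  for (int i = 0; i < numAtoms; ++i) {
    idOrder[id[i]] = i;  // e.g. id = {2, 0, 1}  ->  idOrder = {1, 2, 0}
  }
}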
3067 
3068 __global__ void scaleCoordinateUsingGCKernel(
3069  double* __restrict pos_x,
3070  double* __restrict pos_y,
3071  double* __restrict pos_z,
3072  const int* __restrict idOrder,
3073  const int* __restrict moleculeStartIndex,
3074  const int* __restrict moleculeAtom,
3075  const cudaTensor factor,
3076  const cudaVector origin,
3077  const Lattice oldLattice,
3078  const Lattice newLattice,
3079  const char3* __restrict transform,
3080  const int numMolecules,
3081  const int numLargeMolecules)
3082 {
3083  // missing fixed atoms implementation
3084  int startIdx, endIdx, i, j, jmapped, atomIndex;
3085  double3 position, r_gc, new_r_gc, delta_r_gc;
3086  char3 tr;
3087  r_gc.x = 0; r_gc.y = 0; r_gc.z = 0;
3088  if (blockIdx.x < numLargeMolecules){ //large molecule case
3089  i = blockIdx.x;
3090  startIdx = moleculeStartIndex[i];
3091  endIdx = moleculeStartIndex[i + 1];
3092  __shared__ double3 sh_gc;
3093  double inv_length = 1.0/(double)(endIdx - startIdx);
3094  // calculate the geometric center
3095  for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3096  atomIndex = moleculeAtom[j];
3097  jmapped = idOrder[atomIndex];
3098  tr = transform[jmapped];
3099  position.x = pos_x[jmapped];
3100  position.y = pos_y[jmapped];
3101  position.z = pos_z[jmapped];
3102  //Unwrap the coordinate with oldLattice
3103  position = oldLattice.reverse_transform(position ,tr);
3104  r_gc.x += position.x;
3105  r_gc.y += position.y;
3106  r_gc.z += position.z;
3107  }
3108  __syncthreads();
3109  typedef cub::BlockReduce<double, 64> BlockReduce;
3110  __shared__ typename BlockReduce::TempStorage temp_storage;
3111 
3112  r_gc.x = BlockReduce(temp_storage).Sum(r_gc.x);
3113  __syncthreads();
3114  r_gc.y = BlockReduce(temp_storage).Sum(r_gc.y);
3115  __syncthreads();
3116  r_gc.z = BlockReduce(temp_storage).Sum(r_gc.z);
3117  __syncthreads();
3118 
3119  if (threadIdx.x == 0) {
3120  sh_gc.x = r_gc.x * inv_length;
3121  sh_gc.y = r_gc.y * inv_length;
3122  sh_gc.z = r_gc.z * inv_length;
3123  }
3124  __syncthreads();
3125 
3126  // scale the geometric center with factor
3127  // shift to origin
3128  double tx = sh_gc.x - origin.x;
3129  double ty = sh_gc.y - origin.y;
3130  double tz = sh_gc.z - origin.z;
3131  // apply transformation
3132  new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3133  new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3134  new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3135  // shift back
3136  new_r_gc.x += origin.x;
3137  new_r_gc.y += origin.y;
3138  new_r_gc.z += origin.z;
3139  // translation vector from old GC to new GC
3140  delta_r_gc.x = new_r_gc.x - sh_gc.x;
3141  delta_r_gc.y = new_r_gc.y - sh_gc.y;
3142  delta_r_gc.z = new_r_gc.z - sh_gc.z;
3143 
3144  // shift the atoms in molecule with translation vector
3145  for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3146  atomIndex = moleculeAtom[j];
3147  jmapped = idOrder[atomIndex];
3148  tr = transform[jmapped];
3149  position.x = pos_x[jmapped];
3150  position.y = pos_y[jmapped];
3151  position.z = pos_z[jmapped];
3152  //Unwrap the coordinate with oldLattice
3153  position = oldLattice.reverse_transform(position, tr);
3154  position.x += delta_r_gc.x;
3155  position.y += delta_r_gc.y;
3156  position.z += delta_r_gc.z;
3157  // wrap the coordinate with new lattice parameter
3158  position = newLattice.apply_transform(position, tr);
3159  pos_x[jmapped] = position.x;
3160  pos_y[jmapped] = position.y;
3161  pos_z[jmapped] = position.z;
3162  }
3163  } else { //Small molecule
3164  // blocks 0..numLargeMolecules-1 have already been assigned to the large molecules
3165  i = numLargeMolecules + threadIdx.x +
3166  (blockIdx.x - numLargeMolecules) * blockDim.x;
3167 
3168  if (i < numMolecules) {
3169  startIdx = moleculeStartIndex[i];
3170  endIdx = moleculeStartIndex[i+1];
3171  double inv_length = 1.0/(double)(endIdx - startIdx);
3172 
3173  // calculate the geometric center
3174  for ( j = startIdx; j < endIdx; j++ ) {
3175  atomIndex = moleculeAtom[j];
3176  jmapped = idOrder[atomIndex];
3177  tr = transform[jmapped];
3178  position.x = pos_x[jmapped];
3179  position.y = pos_y[jmapped];
3180  position.z = pos_z[jmapped];
3181  //Unwrap the coordinate with oldLattice
3182  position = oldLattice.reverse_transform(position, tr);
3183  r_gc.x += position.x;
3184  r_gc.y += position.y;
3185  r_gc.z += position.z;
3186  }
3187 
3188  r_gc.x *= inv_length;
3189  r_gc.y *= inv_length;
3190  r_gc.z *= inv_length;
3191 
3192  // scale the geometric center with factor
3193  // shift to origin
3194  double tx = r_gc.x - origin.x;
3195  double ty = r_gc.y - origin.y;
3196  double tz = r_gc.z - origin.z;
3197  // apply transformation
3198  new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3199  new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3200  new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3201  // shift back
3202  new_r_gc.x += origin.x;
3203  new_r_gc.y += origin.y;
3204  new_r_gc.z += origin.z;
3205  // translation vector from old GC to new GC
3206  delta_r_gc.x = new_r_gc.x - r_gc.x;
3207  delta_r_gc.y = new_r_gc.y - r_gc.y;
3208  delta_r_gc.z = new_r_gc.z - r_gc.z;
3209 
3210  // shift the atoms in molecule with translation vector
3211  for (j = startIdx; j < endIdx; j++) {
3212  atomIndex = moleculeAtom[j];
3213  jmapped = idOrder[atomIndex];
3214 
3215  tr = transform[jmapped];
3216  position.x = pos_x[jmapped];
3217  position.y = pos_y[jmapped];
3218  position.z = pos_z[jmapped];
3219  // unwrap the coordinate with oldLattice
3220  position = oldLattice.reverse_transform(position, tr);
3221  position.x += delta_r_gc.x;
3222  position.y += delta_r_gc.y;
3223  position.z += delta_r_gc.z;
3224  // wrap the coordinate with new lattice parameter
3225  position = newLattice.apply_transform(position, tr);
3226  pos_x[jmapped] = position.x;
3227  pos_y[jmapped] = position.y;
3228  pos_z[jmapped] = position.z;
3229  }
3230  }
3231  }
3232 }
3233 
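// Langevin piston rescaling with group-based pressure: one thread per
// hydrogen-group parent atom (hgs > 0).  The group's center of mass is
// scaled about the cell origin,
//   new_r_cm = origin + factor * (r_cm - origin),
// every atom in the group is shifted by (new_r_cm - r_cm), and the group
// velocity gets the increment (velFactor - 1) * v_cm per component.
// With doFixed, fully fixed groups and individual fixed atoms are instead
// restored to their stored fixed positions, wrapped into the current
// lattice.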
3234 template <bool doFixed>
3235 __global__ void langevinPistonUseGroupPressureKernel(
3236  const int* __restrict atomFixed,
3237  const int* __restrict groupFixed,
3238  const Lattice lattice,
3239  const char3* transform,
3240  const double* __restrict fixedPosition_x,
3241  const double* __restrict fixedPosition_y,
3242  const double* __restrict fixedPosition_z,
3243  double * __restrict pos_x,
3244  double * __restrict pos_y,
3245  double * __restrict pos_z,
3246  double * __restrict vel_x,
3247  double * __restrict vel_y,
3248  double * __restrict vel_z,
3249  float *mass,
3250  int *hydrogenGroupSize,
3251  cudaTensor factor,
3252  cudaVector origin,
3253  double velFactor_x,
3254  double velFactor_y,
3255  double velFactor_z,
3256  int numAtoms)
3257 {
3258  int i = threadIdx.x + blockIdx.x*blockDim.x;
3259  if (i < numAtoms) {
3260  int hgs = hydrogenGroupSize[i];
3261  int j;
3262  if (hgs > 0) {
3263  if (doFixed) {
3264  if (groupFixed[i]) {
3265  for ( j = i; j < (i+hgs); ++j ) {
3266  const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3267  const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3268  pos_x[j] = pos_trans.x;
3269  pos_y[j] = pos_trans.y;
3270  pos_z[j] = pos_trans.z;
3271  }
3272  return;
3273  }
3274  }
3275  BigReal m_cm = 0;
3276  BigReal r_cm_x = 0;
3277  BigReal r_cm_y = 0;
3278  BigReal r_cm_z = 0;
3279  BigReal v_cm_x = 0;
3280  BigReal v_cm_y = 0;
3281  BigReal v_cm_z = 0;
3282  for ( j = i; j < (i+hgs); ++j ) {
3283  if (doFixed) {
3284  if (atomFixed[j]) continue;
3285  }
3286  m_cm += mass[j];
3287  r_cm_x += mass[j] * pos_x[j];
3288  r_cm_y += mass[j] * pos_y[j];
3289  r_cm_z += mass[j] * pos_z[j];
3290  v_cm_x += mass[j] * vel_x[j];
3291  v_cm_y += mass[j] * vel_y[j];
3292  v_cm_z += mass[j] * vel_z[j];
3293  }
3294  BigReal inv_m_cm = 1.0/m_cm;
3295  r_cm_x *= inv_m_cm;
3296  r_cm_y *= inv_m_cm;
3297  r_cm_z *= inv_m_cm;
3298 
3299  double tx = r_cm_x - origin.x;
3300  double ty = r_cm_y - origin.y;
3301  double tz = r_cm_z - origin.z;
3302  double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3303  double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3304  double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3305  new_r_cm_x += origin.x;
3306  new_r_cm_y += origin.y;
3307  new_r_cm_z += origin.z;
3308 
3309  double delta_r_cm_x = new_r_cm_x - r_cm_x;
3310  double delta_r_cm_y = new_r_cm_y - r_cm_y;
3311  double delta_r_cm_z = new_r_cm_z - r_cm_z;
3312  v_cm_x *= inv_m_cm;
3313  v_cm_y *= inv_m_cm;
3314  v_cm_z *= inv_m_cm;
3315  double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
3316  double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
3317  double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
3318  for (j = i; j < (i+hgs); j++) {
3319  if (doFixed) {
3320  if (atomFixed[j]) {
3321  const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3322  const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3323  pos_x[j] = pos_trans.x;
3324  pos_y[j] = pos_trans.y;
3325  pos_z[j] = pos_trans.z;
3326  continue;
3327  }
3328  }
3329  pos_x[j] += delta_r_cm_x;
3330  pos_y[j] += delta_r_cm_y;
3331  pos_z[j] += delta_r_cm_z;
3332  vel_x[j] += delta_v_cm_x;
3333  vel_y[j] += delta_v_cm_y;
3334  vel_z[j] += delta_v_cm_z;
3335  }
3336  }
3337 
3338  }
3339 }
3340 
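// Langevin piston rescaling without group pressure: each atom is scaled
// independently about the origin by the factor tensor and its velocity is
// multiplied component-wise by velFactor.  With doFixed, fixed atoms are
// restored to their stored fixed positions instead.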
3341 template <bool doFixed>
3342 __global__ void langevinPistonNoGroupPressureKernel(
3343  const int* __restrict atomFixed,
3344  const Lattice lattice,
3345  const char3* transform,
3346  const double* __restrict fixedPosition_x,
3347  const double* __restrict fixedPosition_y,
3348  const double* __restrict fixedPosition_z,
3349  double * __restrict pos_x,
3350  double * __restrict pos_y,
3351  double * __restrict pos_z,
3352  double * __restrict vel_x,
3353  double * __restrict vel_y,
3354  double * __restrict vel_z,
3355  cudaTensor factor,
3356  cudaVector origin,
3357  double velFactor_x,
3358  double velFactor_y,
3359  double velFactor_z,
3360  int numAtoms)
3361 {
3362  int i = threadIdx.x + blockIdx.x*blockDim.x;
3363  if (i < numAtoms) {
3364  if (doFixed) {
3365  if (atomFixed[i]) {
3366  const double3 pos_origin{fixedPosition_x[i], fixedPosition_y[i], fixedPosition_z[i]};
3367  const auto pos_trans = lattice.apply_transform(pos_origin, transform[i]);
3368  pos_x[i] = pos_trans.x;
3369  pos_y[i] = pos_trans.y;
3370  pos_z[i] = pos_trans.z;
3371  return;
3372  }
3373  }
3374  double tx = pos_x[i] - origin.x;
3375  double ty = pos_y[i] - origin.y;
3376  double tz = pos_z[i] - origin.z;
3377  double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3378  double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3379  double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3380  pos_x[i] = ftx + origin.x;
3381  pos_y[i] = fty + origin.y;
3382  pos_z[i] = ftz + origin.z;
3383  vel_x[i] *= velFactor_x;
3384  vel_y[i] *= velFactor_y;
3385  vel_z[i] *= velFactor_z;
3386  }
3387 }
3388 
3389 void SequencerCUDAKernel::scaleCoordinateWithFactor(
3390  double *pos_x,
3391  double *pos_y,
3392  double *pos_z,
3393  float *mass,
3394  int *hydrogenGroupSize,
3395  cudaTensor factor,
3396  cudaVector origin,
3397  int useGroupPressure,
3398  int numAtoms,
3399  cudaStream_t stream)
3400 {
3401  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3402  if (useGroupPressure) {
3403  scaleCoordinateUseGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3404  pos_x, pos_y, pos_z, mass, hydrogenGroupSize, factor, origin, numAtoms);
3405  } else {
3406  scaleCoordinateNoGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3407  pos_x, pos_y, pos_z, factor, origin, numAtoms);
3408  }
3409 }
3410 
3411 
3412 void SequencerCUDAKernel::SetAtomIndexOrder(
3413  int *id,
3414  int *idOrder,
3415  int numAtoms,
3416  cudaStream_t stream)
3417 {
3418  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3419  SetAtomIndexOrderKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3420  id, idOrder, numAtoms);
3421 }
3422 
3423 void SequencerCUDAKernel::scaleCoordinateUsingGC(
3424  double *pos_x,
3425  double *pos_y,
3426  double *pos_z,
3427  const int *idOrder,
3428  const int *moleculeStartIndex,
3429  const int *moleculeAtom,
3430  const cudaTensor factor,
3431  const cudaVector origin,
3432  const Lattice oldLattice,
3433  const Lattice newLattice,
3434  const char3 *transform,
3435  const int numMolecules,
3436  const int numLargeMolecules,
3437  cudaStream_t stream)
3438 {
3439  // we want each thread to calculate the geometric center of a molecule with
3440  // fewer than 128 atoms, and one thread block to handle each molecule
3441  // with more than 128 atoms
3442  int numThreadsPerBlock = 64;
3443  int grid = ((numMolecules - numLargeMolecules + numThreadsPerBlock - 1) /
3444  numThreadsPerBlock) + numLargeMolecules;
3445  scaleCoordinateUsingGCKernel<<<grid, numThreadsPerBlock, 0, stream>>>(
3446  pos_x, pos_y, pos_z, idOrder, moleculeStartIndex,
3447  moleculeAtom, factor, origin, oldLattice, newLattice,
3448  transform, numMolecules, numLargeMolecules);
3449 
3450 }
3451 
3452 void SequencerCUDAKernel::langevinPiston(
3453  const bool doFixedAtoms,
3454  const int* atomFixed,
3455  const int* groupFixed,
3456  const char3* transform,
3457  const Lattice lattice,
3458  const double* fixedPosition_x,
3459  const double* fixedPosition_y,
3460  const double* fixedPosition_z,
3461  double *pos_x,
3462  double *pos_y,
3463  double *pos_z,
3464  double *vel_x,
3465  double *vel_y,
3466  double *vel_z,
3467  float *mass,
3468  int *hydrogenGroupSize,
3469  cudaTensor factor,
3470  cudaVector origin,
3471  double velFactor_x,
3472  double velFactor_y,
3473  double velFactor_z,
3474  int useGroupPressure,
3475  int numAtoms,
3476  cudaStream_t stream)
3477 {
3478  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3479  #if 0
3480  double *h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3481  double *h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3482  double *h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3483 
3484  double *h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3485  double *h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3486  double *h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3487 
3488  copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3489  copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3490  copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3491 
3492  copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3493  copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3494  copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3495 
3496  fprintf(stderr, "velFactors = %lf %lf %lf\n",
3497  velFactor_x, velFactor_y, velFactor_z);
3498  for(int i = 0; i < numAtoms; i++){
3499  fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3500  h_pos_x[i], h_pos_y[i], h_pos_z[i],
3501  h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3502  }
3503  free(h_pos_x);
3504  free(h_pos_y);
3505  free(h_pos_z);
3506  free(h_vel_x);
3507  free(h_vel_y);
3508  free(h_vel_z);
3509  #endif
3510 
3511  if (useGroupPressure) {
3512  if (doFixedAtoms) {
3513  langevinPistonUseGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3514  atomFixed, groupFixed, lattice, transform,
3515  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3516  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3517  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3518  } else {
3519  langevinPistonUseGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3520  atomFixed, groupFixed, lattice, transform,
3521  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3522  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3523  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3524  }
3525  } else {
3526  if (doFixedAtoms) {
3527  langevinPistonNoGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3528  atomFixed, lattice, transform,
3529  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3530  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3531  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3532  } else {
3533  langevinPistonNoGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3534  atomFixed, lattice, transform,
3535  fixedPosition_x, fixedPosition_y, fixedPosition_z,
3536  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3537  factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3538  }
3539  }
3540  //cudaCheck(cudaGetLastError());
3541 #if 0
3542  h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3543  h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3544  h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3545 
3546  h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3547  h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3548  h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3549 
3550  copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3551  copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3552  copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3553 
3554  copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3555  copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3556  copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3557 
3558  for(int i = 0; i < numAtoms; i++){
3559  fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3560  h_pos_x[i], h_pos_y[i], h_pos_z[i],
3561  h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3562  }
3563 #endif
3564 }
3565 
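// Reduces per-atom momentum and angular momentum (taken about origin) into
// host-mapped scalars.  Each block reduces its partial sums with CUB and
// atomically accumulates them into ATOMIC_BINS device bins; the last block
// to finish (tracked through tbcatomic[0]) combines the bins with a warp
// reduction, writes the totals to the host-mapped h_* buffers, and resets
// the bins and the atomic counter.  The kinetic-energy path is currently
// commented out in this kernel.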
3566 template <int BLOCK_SIZE>
3567 __global__ void submitReduction1Kernel(
3568  double * __restrict pos_x,
3569  double * __restrict pos_y,
3570  double * __restrict pos_z,
3571  double * __restrict vel_x,
3572  double * __restrict vel_y,
3573  double * __restrict vel_z,
3574  float * __restrict mass,
3575  // TODO: wrap these scalars in a struct
3576  BigReal *kineticEnergy,
3577  BigReal *momentum_x,
3578  BigReal *momentum_y,
3579  BigReal *momentum_z,
3580  BigReal *angularMomentum_x,
3581  BigReal *angularMomentum_y,
3582  BigReal *angularMomentum_z,
3583  BigReal origin_x,
3584  BigReal origin_y,
3585  BigReal origin_z,
3586  BigReal *h_kineticEnergy,
3587  BigReal *h_momentum_x,
3588  BigReal *h_momentum_y,
3589  BigReal *h_momentum_z,
3590  BigReal *h_angularMomentum_x,
3591  BigReal *h_angularMomentum_y,
3592  BigReal *h_angularMomentum_z,
3593  unsigned int* tbcatomic,
3594  int numAtoms)
3595 {
3596  BigReal m_x = 0, m_y = 0, m_z = 0,
3597  a_x = 0, a_y = 0, a_z = 0;
3598  int i = threadIdx.x + blockIdx.x*blockDim.x;
3599  int totaltb = gridDim.x;
3600  __shared__ bool isLastBlockDone;
3601 
3602  if(threadIdx.x == 0){
3603  isLastBlockDone = false;
3604  }
3605  __syncthreads();
3606 
3607  if (i < numAtoms) {
3608  // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
3609  // k += mass[i] *
3610  // (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
3611 
3612  // vector momentum += mass[i] * vel[i]
3613  m_x += mass[i] * vel_x[i];
3614  m_y += mass[i] * vel_y[i];
3615  m_z += mass[i] * vel_z[i];
3616 
3617  // vector dpos = pos[i] - origin
3618  BigReal dpos_x = pos_x[i] - origin_x;
3619  BigReal dpos_y = pos_y[i] - origin_y;
3620  BigReal dpos_z = pos_z[i] - origin_z;
3621 
3622  // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
3623  a_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
3624  a_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
3625  a_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
3626  }
3627  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
3628  __shared__ typename BlockReduce::TempStorage temp_storage;
3629  // k = BlockReduce(temp_storage).Sum(k);
3630  // __syncthreads();
3631 
3632  m_x = BlockReduce(temp_storage).Sum(m_x);
3633  __syncthreads();
3634  m_y = BlockReduce(temp_storage).Sum(m_y);
3635  __syncthreads();
3636  m_z = BlockReduce(temp_storage).Sum(m_z);
3637  __syncthreads();
3638  a_x = BlockReduce(temp_storage).Sum(a_x);
3639  __syncthreads();
3640  a_y = BlockReduce(temp_storage).Sum(a_y);
3641  __syncthreads();
3642  a_z = BlockReduce(temp_storage).Sum(a_z);
3643  __syncthreads();
3644 
3645  if (threadIdx.x == 0) {
3646  const int bin = blockIdx.x % ATOMIC_BINS;
3647 
3648  // atomicAdd(&kineticEnergy[bin], k);
3649  atomicAdd(&momentum_x[bin], m_x);
3650  atomicAdd(&momentum_y[bin], m_y);
3651  atomicAdd(&momentum_z[bin], m_z);
3652  atomicAdd(&angularMomentum_x[bin], a_x);
3653  atomicAdd(&angularMomentum_y[bin], a_y);
3654  atomicAdd(&angularMomentum_z[bin], a_z);
3655  __threadfence();
3656  unsigned int value = atomicInc(&tbcatomic[0], totaltb);
3657  isLastBlockDone = (value == (totaltb-1));
3658  }
3659 
3660  __syncthreads();
3661 
3662 #if 1
3663  if(isLastBlockDone){
3664  if(threadIdx.x < ATOMIC_BINS){
3665  const int bin = threadIdx.x;
3666 
3667  double m_x = momentum_x[bin];
3668  double m_y = momentum_y[bin];
3669  double m_z = momentum_z[bin];
3670  double a_x = angularMomentum_x[bin];
3671  double a_y = angularMomentum_y[bin];
3672  double a_z = angularMomentum_z[bin];
3673 
3674  // sets device scalars back to zero
3675  kineticEnergy[0] = 0.0;
3676  momentum_x[bin] = 0.0;
3677  momentum_y[bin] = 0.0;
3678  momentum_z[bin] = 0.0;
3679  angularMomentum_x[bin] = 0.0;
3680  angularMomentum_y[bin] = 0.0;
3681  angularMomentum_z[bin] = 0.0;
3682 
3683  if(ATOMIC_BINS > 1){
3684  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3685  __shared__ typename WarpReduce::TempStorage tempStorage;
3686 
3687  m_x = WarpReduce(tempStorage).Sum(m_x);
3688  m_y = WarpReduce(tempStorage).Sum(m_y);
3689  m_z = WarpReduce(tempStorage).Sum(m_z);
3690  a_x = WarpReduce(tempStorage).Sum(a_x);
3691  a_y = WarpReduce(tempStorage).Sum(a_y);
3692  a_z = WarpReduce(tempStorage).Sum(a_z);
3693  }
3694 
3695  if(threadIdx.x == 0){
3696  h_momentum_x[0] = m_x;
3697  h_momentum_y[0] = m_y;
3698  h_momentum_z[0] = m_z;
3699  h_angularMomentum_x[0] = a_x;
3700  h_angularMomentum_y[0] = a_y;
3701  h_angularMomentum_z[0] = a_z;
3702 
3703  //resets atomic counter
3704  reset_atomic_counter(&tbcatomic[0]);
3705  }
3706  }
3707  }
3708 #endif
3709 }
3710 
3711 
3712 // JM: does addForceToMomentum, maximumMove, and addVelocityToPosition
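// The switch over maxForceNumber below intentionally falls through (no
// break statements): maxForceNumber == 2 accumulates slow, nonbonded, and
// normal force contributions into the velocity, while maxForceNumber == 0
// applies only the normal forces.  Positions and velocities stay in
// registers until the final stores.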
3713 template <bool DO_VEL_RESCALING, bool DO_FIXED>
3714 __global__ void velocityVerlet1Kernel(
3715  const int step,
3716  const double scaling,
3717  const double dt_normal,
3718  const double dt_nbond,
3719  const double dt_slow,
3720  const double velrescaling, // for stochastic velocity rescaling
3721  const double* __restrict recipMass,
3722  double* __restrict vel_x,
3723  double* __restrict vel_y,
3724  double* __restrict vel_z,
3725  const double maxvel2,
3726  int* h_killme,
3727  double* __restrict pos_x,
3728  double* __restrict pos_y,
3729  double* __restrict pos_z,
3730  double* h_pos_x,
3731  double* h_pos_y,
3732  double* h_pos_z,
3733  double* __restrict f_normal_x,
3734  double* __restrict f_normal_y,
3735  double* __restrict f_normal_z,
3736  double* __restrict f_nbond_x,
3737  double* __restrict f_nbond_y,
3738  double* __restrict f_nbond_z,
3739  double* __restrict f_slow_x,
3740  double* __restrict f_slow_y,
3741  double* __restrict f_slow_z,
3742  const int* atomFixed,
3743  const int numAtoms,
3744  const int maxForceNumber
3745  ){
3746  // fusion of addForceToMomentum, maximumMove, addVelocityToPosition
3747  double dt, dt_b, dt_nb, dt_s;
3748  int i = threadIdx.x + blockIdx.x*blockDim.x;
3749 
3750  if(i < numAtoms){
3751  dt = dt_normal;
3752  double velx = vel_x[i];
3753  double vely = vel_y[i];
3754  double velz = vel_z[i];
3755  if (DO_VEL_RESCALING) {
3756  velx *= velrescaling;
3757  vely *= velrescaling;
3758  velz *= velrescaling;
3759  }
3760  // JM NOTE: these need to be patch-centered
3761  double posx = pos_x[i];
3762  double posy = pos_y[i];
3763  double posz = pos_z[i];
3764  double rmass = recipMass[i];
3765  /* addForceToMomentum*/
3766  // keep velocities in registers so I can access them faster when calculating positions!
3767  switch(maxForceNumber){
3768  case 2:
3769  dt_s = dt_slow * scaling;
3770  velx += f_slow_x[i] * rmass * dt_s;
3771  vely += f_slow_y[i] * rmass * dt_s;
3772  velz += f_slow_z[i] * rmass * dt_s;
3773  // f_slow_x[i] = 0.0;
3774  // f_slow_y[i] = 0.0;
3775  // f_slow_z[i] = 0.0;
3776  case 1:
3777  dt_nb = dt_nbond * scaling;
3778  velx += f_nbond_x[i] * rmass * dt_nb;
3779  vely += f_nbond_y[i] * rmass * dt_nb;
3780  velz += f_nbond_z[i] * rmass * dt_nb;
3781  // f_nbond_x[i] = 0.0;
3782  // f_nbond_y[i] = 0.0;
3783  // f_nbond_z[i] = 0.0;
3784  case 0:
3785  dt_b = dt_normal * scaling;
3786  velx += f_normal_x[i] * rmass * dt_b;
3787  vely += f_normal_y[i] * rmass * dt_b;
3788  velz += f_normal_z[i] * rmass * dt_b;
3789  // f_normal_x[i] = 0.0;
3790  // f_normal_y[i] = 0.0;
3791  // f_normal_z[i] = 0.0;
3792  }
3793 
3794  // --------------------------------------------------------
3795 
3796  // -- MaximumMove --
3797  double vel2 = velx * velx + vely * vely + velz * velz;
3798  if(vel2 > maxvel2) atomicAdd(h_killme, 1); // stops dynamics if true, perf does not matter
3799  // --------------------------------------------------------
3800 
3801  // -- AddVelocityToPosition --
3802  // ---------------------------------------------------------
3803  if (!DO_FIXED) {
3804  posx += velx * dt;
3805  posy += vely * dt;
3806  posz += velz * dt;
3807  pos_x[i] = posx;
3808  pos_y[i] = posy;
3809  pos_z[i] = posz;
3810  vel_x[i] = velx;
3811  vel_y[i] = vely;
3812  vel_z[i] = velz;
3813  } else {
3814  // Haochuan: for fixed atoms, we need to clear their velocities (see HomePatch::addForceToMomentum)
3815  if (!atomFixed[i]) {
3816  posx += velx * dt;
3817  posy += vely * dt;
3818  posz += velz * dt;
3819  pos_x[i] = posx;
3820  pos_y[i] = posy;
3821  pos_z[i] = posz;
3822  vel_x[i] = velx;
3823  vel_y[i] = vely;
3824  vel_z[i] = velz;
3825  } else {
3826  vel_x[i] = 0;
3827  vel_y[i] = 0;
3828  vel_z[i] = 0;
3829  }
3830  }
3831  }
3832 
3833 }
3834 
3835 void SequencerCUDAKernel::velocityVerlet1(
3836  const bool doFixedAtoms,
3837  const int step,
3838  const double scaling,
3839  const double dt_normal,
3840  const double dt_nbond,
3841  const double dt_slow,
3842  const double velrescaling, // for stochastic velocity rescaling
3843  const double *recipMass,
3844  double* vel_x,
3845  double* vel_y,
3846  double* vel_z,
3847  double maxvel2,
3848  int* h_killme,
3849  double* pos_x,
3850  double* pos_y,
3851  double* pos_z,
3852  double* h_pos_x,
3853  double* h_pos_y,
3854  double* h_pos_z,
3855  double* f_normal_x,
3856  double* f_normal_y,
3857  double* f_normal_z,
3858  double* f_nbond_x,
3859  double* f_nbond_y,
3860  double* f_nbond_z,
3861  double* f_slow_x,
3862  double* f_slow_y,
3863  double* f_slow_z,
3864  const int* atomFixed,
3865  const int numAtoms,
3866  const int maxForceNumber,
3867  cudaStream_t stream)
3868 {
3869  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3870  const bool doVelRescaling = (velrescaling != 1.0);
3871 #define CALL(DO_VEL_RESCALING, DO_FIXED) \
3872  velocityVerlet1Kernel<DO_VEL_RESCALING, DO_FIXED> \
3873  <<<grid, ATOM_BLOCKS, 0, stream>>>( \
3874  step, scaling, dt_normal, dt_nbond, dt_slow, velrescaling, recipMass, \
3875  vel_x, vel_y, vel_z, maxvel2, h_killme, \
3876  pos_x, pos_y, pos_z, h_pos_x, h_pos_y, h_pos_z, \
3877  f_normal_x, f_normal_y, f_normal_z, \
3878  f_nbond_x, f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z, \
3879  atomFixed, numAtoms, maxForceNumber \
3880  )
3881  if ( doVelRescaling && doFixedAtoms) CALL(true, true);
3882  else if ( doVelRescaling && !doFixedAtoms) CALL(true, false);
3883  else if (!doVelRescaling && doFixedAtoms) CALL(false, true);
3884  else if (!doVelRescaling && !doFixedAtoms) CALL(false, false);
3885  else {
3886  NAMD_bug("No kernel was called in SequencerCUDAKernel::velocityVerlet1!\n");
3887  }
3888 #undef CALL
3889 }
3890 
3891 void SequencerCUDAKernel::submitReduction1(
3892  double *pos_x,
3893  double *pos_y,
3894  double *pos_z,
3895  double *vel_x,
3896  double *vel_y,
3897  double *vel_z,
3898  float *mass,
3899  BigReal *kineticEnergy,
3900  BigReal *momentum_x,
3901  BigReal *momentum_y,
3902  BigReal *momentum_z,
3903  BigReal *angularMomentum_x,
3904  BigReal *angularMomentum_y,
3905  BigReal *angularMomentum_z,
3906  BigReal origin_x,
3907  BigReal origin_y,
3908  BigReal origin_z,
3909  BigReal *h_kineticEnergy,
3910  BigReal *h_momentum_x,
3911  BigReal *h_momentum_y,
3912  BigReal *h_momentum_z,
3913  BigReal *h_angularMomentum_x,
3914  BigReal *h_angularMomentum_y,
3915  BigReal *h_angularMomentum_z,
3916  unsigned int* tbcatomic,
3917  int numAtoms,
3918  cudaStream_t stream)
3919 {
3920  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3921  submitReduction1Kernel<ATOM_BLOCKS><<<grid, ATOM_BLOCKS, 0, stream>>>(
3922  pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass,
3923  kineticEnergy, momentum_x, momentum_y, momentum_z,
3924  angularMomentum_x, angularMomentum_y, angularMomentum_z,
3925  origin_x, origin_y, origin_z, h_kineticEnergy, h_momentum_x, h_momentum_y,
3926  h_momentum_z, h_angularMomentum_x, h_angularMomentum_y, h_angularMomentum_z,
3927  tbcatomic, numAtoms);
3928  //cudaCheck(cudaGetLastError());
3929 }
3930 
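// Reduces kinetic energy, internal kinetic energy (velocities measured
// against the center-of-mass velocities supplied in vcm_*), and the
// internal virial tensors.  Without DO_MTS all force classes are summed
// into a single internal virial (intVNormal); with DO_MTS the nonbonded
// and slow contributions are kept in separate tensors.  Fixed atoms are
// skipped when DO_FIXED is set.  As in submitReduction1Kernel, per-block
// results are binned atomically and the last block finishes the reduction
// into the host-mapped buffers.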
3931 template <int BLOCK_SIZE, bool DO_FIXED, bool DO_MTS>
3932 __global__ void submitReduction2Kernel(
3933  const int* __restrict atomFixed,
3934  const double * __restrict pos_x,
3935  const double * __restrict pos_y,
3936  const double * __restrict pos_z,
3937  const double * __restrict vel_x,
3938  const double * __restrict vel_y,
3939  const double * __restrict vel_z,
3940  const double * __restrict rcm_x,
3941  const double * __restrict rcm_y,
3942  const double * __restrict rcm_z,
3943  const double * __restrict vcm_x,
3944  const double * __restrict vcm_y,
3945  const double * __restrict vcm_z,
3946  const double * __restrict f_normal_x,
3947  const double * __restrict f_normal_y,
3948  const double * __restrict f_normal_z,
3949  const double * __restrict f_nbond_x,
3950  const double * __restrict f_nbond_y,
3951  const double * __restrict f_nbond_z,
3952  const double * __restrict f_slow_x,
3953  const double * __restrict f_slow_y,
3954  const double * __restrict f_slow_z,
3955  const float * __restrict mass,
3956  const int * __restrict hydrogenGroupSize,
3957  BigReal *kineticEnergy,
3958  BigReal *h_kineticEnergy,
3959  BigReal *intKineticEnergy,
3960  BigReal *h_intKineticEnergy,
3961  cudaTensor *intVirialNormal,
3962  cudaTensor *intVirialNbond,
3963  cudaTensor *intVirialSlow,
3964  cudaTensor *h_intVirialNormal,
3965  cudaTensor *h_intVirialNbond,
3966  cudaTensor *h_intVirialSlow,
3967  cudaTensor* rigidVirial,
3968  cudaTensor* h_rigidVirial,
3969  unsigned int *tbcatomic,
3970  const int numAtoms,
3971  const int maxForceNumber)
3972 {
3973  BigReal K = 0;
3974  BigReal intK = 0;
3975  cudaTensor intVNormal, intVNbond, intVSlow;
3976  zero_cudaTensor(intVNormal);
3977  if (DO_MTS) {
3978  zero_cudaTensor(intVNbond);
3979  zero_cudaTensor(intVSlow);
3980  }
3981  int i = threadIdx.x + blockIdx.x*blockDim.x;
3982  __shared__ bool isLastBlockDone;
3983  int totaltb = gridDim.x;
3984 
3985  if(threadIdx.x == 0){
3986  isLastBlockDone = false;
3987  }
3988  __syncthreads();
3989 
3990  if (i < numAtoms) {
3991 #if 0
3992  int hgs = hydrogenGroupSize[i];
3993  // let's see, I have the hydrogenGroupSize, but that's not what I need;
3994  if (hgs > 0) {
3995  int j;
3996  BigReal m_cm = 0;
3997  BigReal r_cm_x = 0;
3998  BigReal r_cm_y = 0;
3999  BigReal r_cm_z = 0;
4000  BigReal v_cm_x = 0;
4001  BigReal v_cm_y = 0;
4002  BigReal v_cm_z = 0;
4003  for ( j = i; j < (i+hgs); ++j ) {
4004  r_mass[j -i ] = mass[j];
4005  r_pos[j - i].x = pos_x[j];
4006  r_pos[j - i].y = pos_y[j];
4007  r_pos[j - i].z = pos_z[j];
4008  r_vel[j - i].x = vel_x[j];
4009  r_vel[j - i].y = vel_y[j];
4010  r_vel[j - i].z = vel_z[j];
4011  // m_cm += mass[j];
4012  // r_cm_x += mass[j] * pos_x[j];
4013  // r_cm_y += mass[j] * pos_y[j];
4014  // r_cm_z += mass[j] * pos_z[j];
4015  // v_cm_x += mass[j] * vel_x[j];
4016  // v_cm_y += mass[j] * vel_y[j];
4017  // v_cm_z += mass[j] * vel_z[j];
4018  m_cm += r_mass[j - i];
4019  r_cm_x += r_mass[j - i] * r_pos[j-i].x;
4020  r_cm_y += r_mass[j - i] * r_pos[j-i].y;
4021  r_cm_z += r_mass[j - i] * r_pos[j-i].z;
4022  v_cm_x += r_mass[j - i] * r_vel[j-i].x;
4023  v_cm_y += r_mass[j - i] * r_vel[j-i].y;
4024  v_cm_z += r_mass[j - i] * r_vel[j-i].z;
4025  }
4026  BigReal inv_m_cm = 1.0/m_cm;
4027  r_cm_x *= inv_m_cm;
4028  r_cm_y *= inv_m_cm;
4029  r_cm_z *= inv_m_cm;
4030  v_cm_x *= inv_m_cm;
4031  v_cm_y *= inv_m_cm;
4032  v_cm_z *= inv_m_cm;
4033 
4034  // XXX removed pairInteraction
4035  for ( j = i; j < (i+hgs); ++j ) {
4036  // XXX removed fixed atoms
4037 
4038  // vector vel[j] used twice below
4039  BigReal v_x = r_vel[j-i].x;
4040  BigReal v_y = r_vel[j-i].y;
4041  BigReal v_z = r_vel[j-i].z;
4042 
4043  // vector dv = vel[j] - v_cm
4044  BigReal dv_x = v_x - v_cm_x;
4045  BigReal dv_y = v_y - v_cm_y;
4046  BigReal dv_z = v_z - v_cm_z;
4047 
4048  // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4049  //intK += mass[j] *
4050  // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4051  intK += r_mass[j-i] *
4052  (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4053 
4054  // vector dr = pos[j] - r_cm
4055  // BigReal dr_x = pos_x[j] - r_cm_x;
4056  // BigReal dr_y = pos_y[j] - r_cm_y;
4057  // BigReal dr_z = pos_z[j] - r_cm_z;
4058 
4059  BigReal dr_x = r_pos[j -i].x - r_cm_x;
4060  BigReal dr_y = r_pos[j -i].y - r_cm_y;
4061  BigReal dr_z = r_pos[j -i].z - r_cm_z;
4062 
4063  // tensor intVirialNormal += outer_product(f_normal[j], dr)
4064 
4065  // we're not going to make this function any faster if we don't fix
4066  // the global memory access pattern
4067  intVNormal.xx += f_normal_x[j] * dr_x;
4068  intVNormal.xy += f_normal_x[j] * dr_y;
4069  intVNormal.xz += f_normal_x[j] * dr_z;
4070  intVNormal.yx += f_normal_y[j] * dr_x;
4071  intVNormal.yy += f_normal_y[j] * dr_y;
4072  intVNormal.yz += f_normal_y[j] * dr_z;
4073  intVNormal.zx += f_normal_z[j] * dr_x;
4074  intVNormal.zy += f_normal_z[j] * dr_y;
4075  intVNormal.zz += f_normal_z[j] * dr_z;
4076 
4077  if (maxForceNumber >= 1) {
4078  // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4079  intVNbond.xx += f_nbond_x[j] * dr_x;
4080  intVNbond.xy += f_nbond_x[j] * dr_y;
4081  intVNbond.xz += f_nbond_x[j] * dr_z;
4082  intVNbond.yx += f_nbond_y[j] * dr_x;
4083  intVNbond.yy += f_nbond_y[j] * dr_y;
4084  intVNbond.yz += f_nbond_y[j] * dr_z;
4085  intVNbond.zx += f_nbond_z[j] * dr_x;
4086  intVNbond.zy += f_nbond_z[j] * dr_y;
4087  intVNbond.zz += f_nbond_z[j] * dr_z;
4088  }
4089 
4090  if (maxForceNumber >= 2) {
4091  // tensor intVirialSlow += outer_product(f_slow[j], dr)
4092  intVSlow.xx += f_slow_x[j] * dr_x;
4093  intVSlow.xy += f_slow_x[j] * dr_y;
4094  intVSlow.xz += f_slow_x[j] * dr_z;
4095  intVSlow.yx += f_slow_y[j] * dr_x;
4096  intVSlow.yy += f_slow_y[j] * dr_y;
4097  intVSlow.yz += f_slow_y[j] * dr_z;
4098  intVSlow.zx += f_slow_z[j] * dr_x;
4099  intVSlow.zy += f_slow_z[j] * dr_y;
4100  intVSlow.zz += f_slow_z[j] * dr_z;
4101  }
4102  }
4103  }
4104 #else
4105  // Haochuan: I need to skip the fixed atoms
4106  if (!DO_FIXED || !atomFixed[i]) {
4107  BigReal r_cm_x = rcm_x[i];
4108  BigReal r_cm_y = rcm_y[i];
4109  BigReal r_cm_z = rcm_z[i];
4110  BigReal v_cm_x = vcm_x[i];
4111  BigReal v_cm_y = vcm_y[i];
4112  BigReal v_cm_z = vcm_z[i];
4113 
4114  BigReal v_x = vel_x[i];
4115  BigReal v_y = vel_y[i];
4116  BigReal v_z = vel_z[i];
4117 
4118  // vector dv = vel[j] - v_cm
4119  BigReal dv_x = v_x - v_cm_x;
4120  BigReal dv_y = v_y - v_cm_y;
4121  BigReal dv_z = v_z - v_cm_z;
4122 
4123  // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4124  //intK += mass[j] *
4125  // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4126  K += mass[i] *
4127  (v_x * v_x + v_y*v_y + v_z*v_z);
4128  intK += mass[i] *
4129  (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4130 
4131  // vector dr = pos[j] - r_cm
4132  // BigReal dr_x = pos_x[j] - r_cm_x;
4133  // BigReal dr_y = pos_y[j] - r_cm_y;
4134  // BigReal dr_z = pos_z[j] - r_cm_z;
4135 
4136  BigReal dr_x = pos_x[i] - r_cm_x;
4137  BigReal dr_y = pos_y[i] - r_cm_y;
4138  BigReal dr_z = pos_z[i] - r_cm_z;
4139 
4140  // tensor intVirialNormal += outer_product(f_normal[j], dr)
4141 
4142  // we're not going to make this function any faster if we don't fix
4143  // the global memory access pattern
4144  intVNormal.xx += f_normal_x[i] * dr_x;
4145  intVNormal.xy += f_normal_x[i] * dr_y;
4146  intVNormal.xz += f_normal_x[i] * dr_z;
4147  intVNormal.yx += f_normal_y[i] * dr_x;
4148  intVNormal.yy += f_normal_y[i] * dr_y;
4149  intVNormal.yz += f_normal_y[i] * dr_z;
4150  intVNormal.zx += f_normal_z[i] * dr_x;
4151  intVNormal.zy += f_normal_z[i] * dr_y;
4152  intVNormal.zz += f_normal_z[i] * dr_z;
4153 
4154  if (maxForceNumber >= 1) {
4155  // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4156  // Haochuan: For an MTS simulation we have to calculate intVNbond separately
4157  if (DO_MTS) {
4158  intVNbond.xx += f_nbond_x[i] * dr_x;
4159  intVNbond.xy += f_nbond_x[i] * dr_y;
4160  intVNbond.xz += f_nbond_x[i] * dr_z;
4161  intVNbond.yx += f_nbond_y[i] * dr_x;
4162  intVNbond.yy += f_nbond_y[i] * dr_y;
4163  intVNbond.yz += f_nbond_y[i] * dr_z;
4164  intVNbond.zx += f_nbond_z[i] * dr_x;
4165  intVNbond.zy += f_nbond_z[i] * dr_y;
4166  intVNbond.zz += f_nbond_z[i] * dr_z;
4167  } else {
4168  // If this is not an MTS simulation, then we can sum the internal virials into a single one (intVNormal)
4169  intVNormal.xx += f_nbond_x[i] * dr_x;
4170  intVNormal.xy += f_nbond_x[i] * dr_y;
4171  intVNormal.xz += f_nbond_x[i] * dr_z;
4172  intVNormal.yx += f_nbond_y[i] * dr_x;
4173  intVNormal.yy += f_nbond_y[i] * dr_y;
4174  intVNormal.yz += f_nbond_y[i] * dr_z;
4175  intVNormal.zx += f_nbond_z[i] * dr_x;
4176  intVNormal.zy += f_nbond_z[i] * dr_y;
4177  intVNormal.zz += f_nbond_z[i] * dr_z;
4178  }
4179  }
4180 
4181  if (maxForceNumber >= 2) {
4182  // tensor intVirialSlow += outer_product(f_slow[j], dr)
4183  if (DO_MTS) {
4184  intVSlow.xx += f_slow_x[i] * dr_x;
4185  intVSlow.xy += f_slow_x[i] * dr_y;
4186  intVSlow.xz += f_slow_x[i] * dr_z;
4187  intVSlow.yx += f_slow_y[i] * dr_x;
4188  intVSlow.yy += f_slow_y[i] * dr_y;
4189  intVSlow.yz += f_slow_y[i] * dr_z;
4190  intVSlow.zx += f_slow_z[i] * dr_x;
4191  intVSlow.zy += f_slow_z[i] * dr_y;
4192  intVSlow.zz += f_slow_z[i] * dr_z;
4193  } else {
4194  intVNormal.xx += f_slow_x[i] * dr_x;
4195  intVNormal.xy += f_slow_x[i] * dr_y;
4196  intVNormal.xz += f_slow_x[i] * dr_z;
4197  intVNormal.yx += f_slow_y[i] * dr_x;
4198  intVNormal.yy += f_slow_y[i] * dr_y;
4199  intVNormal.yz += f_slow_y[i] * dr_z;
4200  intVNormal.zx += f_slow_z[i] * dr_x;
4201  intVNormal.zy += f_slow_z[i] * dr_y;
4202  intVNormal.zz += f_slow_z[i] * dr_z;
4203  }
4204  }
4205  }
4206 #endif
4207  }
4208 
4209  typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
4210  __shared__ typename BlockReduce::TempStorage temp_storage;
4211  // XXX TODO: If we handwrite these reductions, it might avoid a
4212  // lot of overhead from launching CUB
4213 
4214  K = BlockReduce(temp_storage).Sum(K);
4215  __syncthreads();
4216  intK = BlockReduce(temp_storage).Sum(intK);
4217  __syncthreads();
4218  intVNormal.xx = BlockReduce(temp_storage).Sum(intVNormal.xx);
4219  __syncthreads();
4220  intVNormal.xy = BlockReduce(temp_storage).Sum(intVNormal.xy);
4221  __syncthreads();
4222  intVNormal.xz = BlockReduce(temp_storage).Sum(intVNormal.xz);
4223  __syncthreads();
4224  intVNormal.yx = BlockReduce(temp_storage).Sum(intVNormal.yx);
4225  __syncthreads();
4226  intVNormal.yy = BlockReduce(temp_storage).Sum(intVNormal.yy);
4227  __syncthreads();
4228  intVNormal.yz = BlockReduce(temp_storage).Sum(intVNormal.yz);
4229  __syncthreads();
4230  intVNormal.zx = BlockReduce(temp_storage).Sum(intVNormal.zx);
4231  __syncthreads();
4232  intVNormal.zy = BlockReduce(temp_storage).Sum(intVNormal.zy);
4233  __syncthreads();
4234  intVNormal.zz = BlockReduce(temp_storage).Sum(intVNormal.zz);
4235  __syncthreads();
4236  if (DO_MTS) {
4237  if (maxForceNumber >= 1) {
4238  intVNbond.xx = BlockReduce(temp_storage).Sum(intVNbond.xx);
4239  __syncthreads();
4240  intVNbond.xy = BlockReduce(temp_storage).Sum(intVNbond.xy);
4241  __syncthreads();
4242  intVNbond.xz = BlockReduce(temp_storage).Sum(intVNbond.xz);
4243  __syncthreads();
4244  intVNbond.yx = BlockReduce(temp_storage).Sum(intVNbond.yx);
4245  __syncthreads();
4246  intVNbond.yy = BlockReduce(temp_storage).Sum(intVNbond.yy);
4247  __syncthreads();
4248  intVNbond.yz = BlockReduce(temp_storage).Sum(intVNbond.yz);
4249  __syncthreads();
4250  intVNbond.zx = BlockReduce(temp_storage).Sum(intVNbond.zx);
4251  __syncthreads();
4252  intVNbond.zy = BlockReduce(temp_storage).Sum(intVNbond.zy);
4253  __syncthreads();
4254  intVNbond.zz = BlockReduce(temp_storage).Sum(intVNbond.zz);
4255  __syncthreads();
4256  }
4257  if (maxForceNumber >= 2) {
4258  intVSlow.xx = BlockReduce(temp_storage).Sum(intVSlow.xx);
4259  __syncthreads();
4260  intVSlow.xy = BlockReduce(temp_storage).Sum(intVSlow.xy);
4261  __syncthreads();
4262  intVSlow.xz = BlockReduce(temp_storage).Sum(intVSlow.xz);
4263  __syncthreads();
4264  intVSlow.yx = BlockReduce(temp_storage).Sum(intVSlow.yx);
4265  __syncthreads();
4266  intVSlow.yy = BlockReduce(temp_storage).Sum(intVSlow.yy);
4267  __syncthreads();
4268  intVSlow.yz = BlockReduce(temp_storage).Sum(intVSlow.yz);
4269  __syncthreads();
4270  intVSlow.zx = BlockReduce(temp_storage).Sum(intVSlow.zx);
4271  __syncthreads();
4272  intVSlow.zy = BlockReduce(temp_storage).Sum(intVSlow.zy);
4273  __syncthreads();
4274  intVSlow.zz = BlockReduce(temp_storage).Sum(intVSlow.zz);
4275  __syncthreads();
4276  }
4277  }
4278 
4279 
4280  if (threadIdx.x == 0) {
4281  const int bin = blockIdx.x % ATOMIC_BINS;
4282 
4283  atomicAdd(&kineticEnergy[bin], K);
4284  atomicAdd(&intKineticEnergy[bin], intK);
4285  atomicAdd(&intVirialNormal[bin].xx, intVNormal.xx);
4286  atomicAdd(&intVirialNormal[bin].xy, intVNormal.xy);
4287  atomicAdd(&intVirialNormal[bin].xz, intVNormal.xz);
4288  atomicAdd(&intVirialNormal[bin].yx, intVNormal.yx);
4289  atomicAdd(&intVirialNormal[bin].yy, intVNormal.yy);
4290  atomicAdd(&intVirialNormal[bin].yz, intVNormal.yz);
4291  atomicAdd(&intVirialNormal[bin].zx, intVNormal.zx);
4292  atomicAdd(&intVirialNormal[bin].zy, intVNormal.zy);
4293  atomicAdd(&intVirialNormal[bin].zz, intVNormal.zz);
4294  if (DO_MTS) {
4295  if (maxForceNumber >= 1) {
4296  atomicAdd(&intVirialNbond[bin].xx, intVNbond.xx);
4297  atomicAdd(&intVirialNbond[bin].xy, intVNbond.xy);
4298  atomicAdd(&intVirialNbond[bin].xz, intVNbond.xz);
4299  atomicAdd(&intVirialNbond[bin].yx, intVNbond.yx);
4300  atomicAdd(&intVirialNbond[bin].yy, intVNbond.yy);
4301  atomicAdd(&intVirialNbond[bin].yz, intVNbond.yz);
4302  atomicAdd(&intVirialNbond[bin].zx, intVNbond.zx);
4303  atomicAdd(&intVirialNbond[bin].zy, intVNbond.zy);
4304  atomicAdd(&intVirialNbond[bin].zz, intVNbond.zz);
4305  }
4306  if (maxForceNumber >= 2) {
4307  atomicAdd(&intVirialSlow[bin].xx, intVSlow.xx);
4308  atomicAdd(&intVirialSlow[bin].xy, intVSlow.xy);
4309  atomicAdd(&intVirialSlow[bin].xz, intVSlow.xz);
4310  atomicAdd(&intVirialSlow[bin].yx, intVSlow.yx);
4311  atomicAdd(&intVirialSlow[bin].yy, intVSlow.yy);
4312  atomicAdd(&intVirialSlow[bin].yz, intVSlow.yz);
4313  atomicAdd(&intVirialSlow[bin].zx, intVSlow.zx);
4314  atomicAdd(&intVirialSlow[bin].zy, intVSlow.zy);
4315  atomicAdd(&intVirialSlow[bin].zz, intVSlow.zz);
4316  }
4317  }
4318  __threadfence();
4319  unsigned int value = atomicInc(&tbcatomic[3], totaltb);
4320  isLastBlockDone = (value == (totaltb -1 ));
4321  }
4322  __syncthreads();
4323  // this function calculates the internal pressures and is a huge bottleneck if we
4324  // do this host-mapped update
4325  // How do I know if this is really the bottleneck?
4326  if(isLastBlockDone){
4327  if(threadIdx.x < ATOMIC_BINS){
4328  const int bin = threadIdx.x;
4329 
4330  double k = kineticEnergy[bin];
4331  double intK = intKineticEnergy[bin];
4332  cudaTensor intVNormal = intVirialNormal[bin];
4333  if (DO_MTS) {
4334  if (maxForceNumber >= 1) {
4335  intVNbond = intVirialNbond[bin];
4336  }
4337  if (maxForceNumber >= 2) {
4338  intVSlow = intVirialSlow[bin];
4339  }
4340  }
4341  cudaTensor rigidV = rigidVirial[bin];
4342 
4343  // sets device scalars back to zero
4344  kineticEnergy[bin] = 0.0;
4345  intKineticEnergy[bin] = 0.0;
4346  zero_cudaTensor(intVirialNormal[bin]);
4347  zero_cudaTensor(intVirialNbond[bin]);
4348  zero_cudaTensor(intVirialSlow[bin]);
4349  zero_cudaTensor(rigidVirial[bin]);
4350 
4351  if(ATOMIC_BINS > 1){
4352  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
4353  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
4354  __shared__ typename WarpReduce::TempStorage tempStorage;
4355  __shared__ typename WarpReduceT::TempStorage tempStorageT;
4356 
4357  // According to https://nvidia.github.io/cccl/cub/developer_overview.html#temporary-storage-usage,
4358  // I should use __syncwarp() here.
4359  k = WarpReduce(tempStorage).Sum(k); NAMD_WARP_SYNC(WARP_FULL_MASK);
4360  intK = WarpReduce(tempStorage).Sum(intK); NAMD_WARP_SYNC(WARP_FULL_MASK);
4361  intVNormal = WarpReduceT(tempStorageT).Sum(intVNormal); NAMD_WARP_SYNC(WARP_FULL_MASK);
4362  rigidV = WarpReduceT(tempStorageT).Sum(rigidV); NAMD_WARP_SYNC(WARP_FULL_MASK);
4363  if (DO_MTS) {
4364  if (maxForceNumber >= 1) {
4365  intVNbond = WarpReduceT(tempStorageT).Sum(intVNbond); NAMD_WARP_SYNC(WARP_FULL_MASK);
4366  }
4367  if (maxForceNumber >= 2) {
4368  intVSlow = WarpReduceT(tempStorageT).Sum(intVSlow); NAMD_WARP_SYNC(WARP_FULL_MASK);
4369  }
4370  }
4371  }
4372 
4373  if(threadIdx.x == 0){
4374  h_kineticEnergy[0] = k;
4375  h_intKineticEnergy[0] = intK;
4376  h_intVirialNormal[0] = intVNormal;
4377  if (DO_MTS) {
4378  if (maxForceNumber >= 1) {
4379  h_intVirialNbond[0] = intVNbond;
4380  }
4381  if (maxForceNumber >= 2) {
4382  h_intVirialSlow[0] = intVSlow;
4383  }
4384  }
4385  h_rigidVirial[0] = rigidV;
4386 
4387  //resets atomic counter
4388  reset_atomic_counter(&tbcatomic[3]);
4389  }
4390  }
4391  }
4392 }
4393 
4394 void SequencerCUDAKernel::submitReduction2(
4395  const bool doFixedAtoms,
4396  const int* atomFixed,
4397  const double *pos_x,
4398  const double *pos_y,
4399  const double *pos_z,
4400  const double *vel_x,
4401  const double *vel_y,
4402  const double *vel_z,
4403  const double *rcm_x,
4404  const double *rcm_y,
4405  const double *rcm_z,
4406  const double *vcm_x,
4407  const double *vcm_y,
4408  const double *vcm_z,
4409  const double *f_normal_x,
4410  const double *f_normal_y,
4411  const double *f_normal_z,
4412  const double *f_nbond_x,
4413  const double *f_nbond_y,
4414  const double *f_nbond_z,
4415  const double *f_slow_x,
4416  const double *f_slow_y,
4417  const double *f_slow_z,
4418  float *mass,
4419  int *hydrogenGroupSize,
4420  BigReal *kineticEnergy,
4421  BigReal *h_kineticEnergy,
4422  BigReal *intKineticEnergy,
4423  BigReal *h_intKineticEnergy,
4424  cudaTensor *intVirialNormal,
4425  cudaTensor *intVirialNbond,
4426  cudaTensor *intVirialSlow,
4427  cudaTensor *h_intVirialNormal,
4428  cudaTensor *h_intVirialNbond,
4429  cudaTensor *h_intVirialSlow,
4430  cudaTensor* rigidVirial,
4431  cudaTensor* h_rigidVirial,
4432  unsigned int *tbcatomic,
4433  int numAtoms,
4434  int maxForceNumber,
4435  const bool doMTS,
4436  cudaStream_t stream)
4437 {
4438 
4439  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4440 #define CALL(DO_FIXED, DO_MTS) \
4441  submitReduction2Kernel<ATOM_BLOCKS, DO_FIXED, DO_MTS><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4442  atomFixed, pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, \
4443  rcm_x, rcm_y, rcm_z, vcm_x, vcm_y, vcm_z, \
4444  f_normal_x, f_normal_y, f_normal_z, \
4445  f_nbond_x, f_nbond_y, f_nbond_z, \
4446  f_slow_x, f_slow_y, f_slow_z, \
4447  mass, hydrogenGroupSize, \
4448  kineticEnergy, h_kineticEnergy, \
4449  intKineticEnergy, h_intKineticEnergy, \
4450  intVirialNormal, intVirialNbond, intVirialSlow, \
4451  h_intVirialNormal, h_intVirialNbond, h_intVirialSlow, rigidVirial, \
4452  h_rigidVirial, tbcatomic, numAtoms, maxForceNumber)
4453  if (doFixedAtoms) {
4454  if (doMTS) CALL(true, true);
4455  else CALL(true, false);
4456  }
4457  else if (!doFixedAtoms) {
4458  if (doMTS) CALL(false, true);
4459  else CALL(false, false);
4460  }
4461  else NAMD_bug("No kernel was called in SequencerCUDAKernel::submitReduction2!\n");
4462 #undef CALL
4463  // cudaCheck(cudaGetLastError());
4464 }
4465 
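// First half-step of the BBK Langevin integrator: damp each velocity by
// (1 - 0.5 * gamma * dt), where gamma is the per-atom langevinParam and
// dt is the timestep converted to internal units.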
4466 __global__ void langevinVelocitiesBBK1Kernel(
4467  BigReal timestep,
4468  const float * __restrict langevinParam,
4469  double * __restrict vel_x,
4470  double * __restrict vel_y,
4471  double * __restrict vel_z,
4472  int numAtoms)
4473 {
4474  BigReal dt = timestep * (0.001 * TIMEFACTOR);
4475  int i = threadIdx.x + blockIdx.x*blockDim.x;
4476  if (i < numAtoms) {
4477  BigReal dt_gamma = dt * langevinParam[i];
4478  BigReal scaling = 1. - 0.5 * dt_gamma;
4479  vel_x[i] *= scaling;
4480  vel_y[i] *= scaling;
4481  vel_z[i] *= scaling;
4482  }
4483 }
4484 
4485 void SequencerCUDAKernel::langevinVelocitiesBBK1(
4486  BigReal timestep,
4487  const float *langevinParam,
4488  double *vel_x,
4489  double *vel_y,
4490  double *vel_z,
4491  int numAtoms,
4492  cudaStream_t stream)
4493 {
4494  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4495  langevinVelocitiesBBK1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4496  timestep, langevinParam, vel_x, vel_y, vel_z, numAtoms);
4497  // cudaCheck(cudaGetLastError());
4498 }
4499 
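// Second half-step of the BBK Langevin integrator: add the random kick
// (per-component Gaussian deviate scaled by langScalRandBBK2), then apply
// the per-atom velocity scaling langScalVelBBK2.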
4500 __global__ void langevinVelocitiesBBK2Kernel(
4501  BigReal timestep,
4502  const float * __restrict langScalVelBBK2,
4503  const float * __restrict langScalRandBBK2,
4504  const float * __restrict gaussrand_x,
4505  const float * __restrict gaussrand_y,
4506  const float * __restrict gaussrand_z,
4507  double * __restrict vel_x,
4508  double * __restrict vel_y,
4509  double * __restrict vel_z,
4510  const int numAtoms)
4511 {
4512  int i = threadIdx.x + blockIdx.x*blockDim.x;
4513  if (i < numAtoms) {
4514  vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
4515  vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
4516  vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
4517  vel_x[i] *= langScalVelBBK2[i];
4518  vel_y[i] *= langScalVelBBK2[i];
4519  vel_z[i] *= langScalVelBBK2[i];
4520  }
4521 }
4522 
4523 
4524 
4525 void SequencerCUDAKernel::langevinVelocitiesBBK2(
4526  BigReal timestep,
4527  const float *langScalVelBBK2,
4528  const float *langScalRandBBK2,
4529  float *gaussrand_x,
4530  float *gaussrand_y,
4531  float *gaussrand_z,
4532  double *vel_x,
4533  double *vel_y,
4534  double *vel_z,
4535  const int numAtoms,
4536  const int numAtomsGlobal,
4537  const int stride,
4538  curandGenerator_t gen,
4539  cudaStream_t stream)
4540 {
4541 
4542  // For some reason, if n == nAtomsDevice + 1, this gives me a misaligned address
4543  // Generating the full array on gaussrand without striding for multiple GPU simulations for now
4544 
4545  // Set the stream on the RAND generator (curandSetStream / hiprandSetStream under HIP) so its kernel launches use the current stream
4546  curandCheck(curandSetStream(gen, stream));
4547  // array buffers have to be even length for pseudorandom normal distribution
4548  // choose n to be the smallest even number >= numAtomsGlobal
4549  // rounding up by 1 is safe since the buffers are allocated larger than numAtoms
4550  int n = (numAtomsGlobal + 1) & (~1);
4551 
4552  curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4553  curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4554  curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4555  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4556  langevinVelocitiesBBK2Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4557  timestep, langScalVelBBK2, langScalRandBBK2,
4558  gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride,
4559  vel_x, vel_y, vel_z, numAtoms);
4560  // cudaCheck(cudaGetLastError());
4561 }
4562 
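// Velocity reassignment from the Maxwell-Boltzmann distribution:
// v = g * sqrt(kbT / m) per component, using the precomputed reciprocal
// masses and unit-variance Gaussian deviates g.  With doFixed, fixed atoms
// get zero velocity.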
4563 template <bool doFixed>
4564 __global__ void reassignVelocitiesKernel(
4565  const BigReal timestep,
4566  const int* __restrict atomFixed,
4567  const float * __restrict gaussrand_x,
4568  const float * __restrict gaussrand_y,
4569  const float * __restrict gaussrand_z,
4570  double * __restrict vel_x,
4571  double * __restrict vel_y,
4572  double * __restrict vel_z,
4573  const double* __restrict d_recipMass,
4574  const BigReal kbT,
4575  const int numAtoms)
4576 {
4577  //! note we are NOT supporting LES (locally enhanced sampling) here
4578  //! hence no switch on the partition to apply tempfactor
4579 
4580  /* */
4581  int i = threadIdx.x + blockIdx.x*blockDim.x;
4582  if (i < numAtoms) {
4583  if (doFixed) {
4584  if (atomFixed[i]) {
4585  vel_x[i] = 0.0;
4586  vel_y[i] = 0.0;
4587  vel_z[i] = 0.0;
4588  } else {
4589  vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4590  vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4591  vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4592  }
4593  } else {
4594  vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4595  vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4596  vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4597  }
4598  }
4599  //__syncthreads();
4600 }
4601 
4602 void SequencerCUDAKernel::reassignVelocities(
4603  const BigReal timestep,
4604  const bool doFixedAtoms,
4605  const int* atomFixed,
4606  float *gaussrand_x,
4607  float *gaussrand_y,
4608  float *gaussrand_z,
4609  double *vel_x,
4610  double *vel_y,
4611  double *vel_z,
4612  const double* d_recipMass,
4613  const BigReal kbT,
4614  const int numAtoms,
4615  const int numAtomsGlobal,
4616  const int stride,
4617  curandGenerator_t gen,
4618  cudaStream_t stream)
4619 {
4620 
4621  curandCheck(curandSetStream(gen, stream));
4622  // array buffers have to be even length for pseudorandom normal distribution
4623  // choose n to be the smallest even number >= numAtomsGlobal
4624  // rounding up by 1 is safe since the buffers are allocated larger than numAtoms
4625  int n = (numAtomsGlobal + 1) & (~1);
4626 
4627  curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4628  curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4629  curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4630  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4631 #define CALL(DOFIXED) \
4632  reassignVelocitiesKernel<DOFIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4633  timestep, atomFixed, \
4634  gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride, \
4635  vel_x, vel_y, vel_z, d_recipMass, kbT, numAtoms)
4636  if (doFixedAtoms) CALL(true);
4637  else CALL(false);
4638 #undef CALL
4639  cudaCheck(cudaStreamSynchronize(stream));
4640 
4641 }
4642 
4643 
4644 
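// Recomputes constrained velocities from the position correction,
// vNew = (posNew - pos) * invdt, for every atom touched by a RATTLE pair,
// guarding against updating the same atom twice within a group.  The
// SETTLE branch is compiled out here (#if 0), so water velocities are
// presumably updated elsewhere.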
4645 __global__ void updateVelocities(const int nRattles,
4646  const int nSettles,
4647  const float invdt,
4648  const int * __restrict settleList,
4649  const CudaRattleElem * __restrict rattleList,
4650  const double * __restrict pos_x,
4651  const double * __restrict pos_y,
4652  const double * __restrict pos_z,
4653  const double * __restrict posNew_x,
4654  const double * __restrict posNew_y,
4655  const double * __restrict posNew_z,
4656  double * velNew_x,
4657  double * velNew_y,
4658  double * velNew_z){
4659 
4660  int tid = threadIdx.x + blockIdx.x*blockDim.x;
4661  int stride = gridDim.x*blockDim.x;
4662 
4663 #if 0
4664  for(int i = tid; i < nSettles; i += stride){
4665  int ig = settleList[i];
4666  //Now I update the position
4667  //Updates three atoms in the settleList
4668  velNew_x[ig] = (posNew_x[ig] - pos_x[ig]) * invdt;
4669  velNew_y[ig] = (posNew_y[ig] - pos_y[ig]) * invdt;
4670  velNew_z[ig] = (posNew_z[ig] - pos_z[ig]) * invdt;
4671 
4672  velNew_x[ig+1] = (posNew_x[ig+1] - pos_x[ig+1]) * invdt;
4673  velNew_y[ig+1] = (posNew_y[ig+1] - pos_y[ig+1]) * invdt;
4674  velNew_z[ig+1] = (posNew_z[ig+1] - pos_z[ig+1]) * invdt;
4675 
4676  velNew_x[ig+2] = (posNew_x[ig+2] - pos_x[ig+2]) * invdt;
4677  velNew_y[ig+2] = (posNew_y[ig+2] - pos_y[ig+2]) * invdt;
4678  velNew_z[ig+2] = (posNew_z[ig+2] - pos_z[ig+2]) * invdt;
4679  }
4680 #endif
4681 
4682  for(int i = tid; i < nRattles; i += stride){
4683  int ig = rattleList[i].ig;
4684  int icnt = rattleList[i].icnt;
4685  bool fixed[4];
4686  fixed[0] = false; fixed[1] = false;
4687  fixed[2] = false; fixed[3] = false;
4688  for(int j = 0; j < icnt; j++){
4689  //Gets two positions
4690  int ia = rattleList[i].params[j].ia;
4691  int ib = rattleList[i].params[j].ib;
4692  //checks if any of these positions have been updated yet
4693  if(!fixed[ia]){
4694  velNew_x[ig+ia] = (posNew_x[ig+ia] - pos_x[ig+ia]) * invdt;
4695  velNew_y[ig+ia] = (posNew_y[ig+ia] - pos_y[ig+ia]) * invdt;
4696  velNew_z[ig+ia] = (posNew_z[ig+ia] - pos_z[ig+ia]) * invdt;
4697  fixed[ia] = true;
4698  }
4699  if(!fixed[ib]){
4700  velNew_x[ig+ib] = (posNew_x[ig+ib] - pos_x[ig+ib]) * invdt;
4701  velNew_y[ig+ib] = (posNew_y[ig+ib] - pos_y[ig+ib]) * invdt;
4702  velNew_z[ig+ib] = (posNew_z[ig+ib] - pos_z[ig+ib]) * invdt;
4703  fixed[ib] = true;
4704  }
4705  }
4706  }
4707 }
4708 
4709 //JM: Function that adds a correction to the bonded forces
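// The constraint force df = mass * (velNew - vel) * invdt is added to
// f_normal and, when doEnergy is set, its virial contribution (the outer
// product of df with the atom position) is block-reduced and atomically
// accumulated into the binned rigid-bond virial; the velocities are then
// replaced by the constrained velocities velNew.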
4710 template<bool doEnergy, bool doFixed>
4711 __global__ void addRattleForce(int numAtoms,
4712  const double invdt,
4713  const int* __restrict atomFixed,
4714  const float * __restrict mass,
4715  const double * __restrict pos_x,
4716  const double * __restrict pos_y,
4717  const double * __restrict pos_z,
4718  double * __restrict vel_x,
4719  double * __restrict vel_y,
4720  double * __restrict vel_z,
4721  double * __restrict velNew_x,
4722  double * __restrict velNew_y,
4723  double * __restrict velNew_z,
4724  double * __restrict f_normal_x,
4725  double * __restrict f_normal_y,
4726  double * __restrict f_normal_z,
4727  cudaTensor* __restrict virial,
4728  cudaTensor* __restrict h_virial,
4729  unsigned int* tbcatomic){
4730 
4731  cudaTensor lVirial;
4732  // zero_cudaTensor(lVirial);
4733  int i = threadIdx.x + blockIdx.x*blockDim.x;
4734 
4735  double df[3];
4736  double pos[3];
4737  lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
4738  lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
4739  lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
4740  //for(int i = tid; i < numAtoms; i += stride){
4741  if(i < numAtoms){
4742  df[0] = (velNew_x[i] - vel_x[i]) * ((double)mass[i] * invdt);
4743  df[1] = (velNew_y[i] - vel_y[i]) * ((double)mass[i] * invdt);
4744  df[2] = (velNew_z[i] - vel_z[i]) * ((double)mass[i] * invdt);
4745  if(doEnergy){
4746  pos[0] = pos_x[i]; pos[1] = pos_y[i]; pos[2] = pos_z[i];
4747  lVirial.xx += df[0] * pos[0];
4748  lVirial.xy += df[0] * pos[1];
4749  lVirial.xz += df[0] * pos[2];
4750  lVirial.yx += df[1] * pos[0];
4751  lVirial.yy += df[1] * pos[1];
4752  lVirial.yz += df[1] * pos[2];
4753  lVirial.zx += df[2] * pos[0];
4754  lVirial.zy += df[2] * pos[1];
4755  lVirial.zz += df[2] * pos[2];
4756  }
4757 
4758  //Updates force and velocities
4759  f_normal_x[i] += df[0];
4760  f_normal_y[i] += df[1];
4761  f_normal_z[i] += df[2];
4762 
4763  // instead of copying I can swap the pointers
4764  vel_x[i] = velNew_x[i];
4765  vel_y[i] = velNew_y[i];
4766  vel_z[i] = velNew_z[i];
4767 
4768  }
4769  // Makes sure that every thread in the block has its lVirial set
4770  // Now do a block reduce on each position of virialTensor
4771  // to get consistent values
4772  // The virial tensor is supposed to be symmetric
4773  // XXX TODO: Handwrite the reductions to avoid syncthreads
4774  if(doEnergy){
4775  typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
4776  __shared__ typename BlockReduce::TempStorage temp_storage;
4777  lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
4778  if (doFixed) {
4779  lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
4780  lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
4781  }
4782  lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
4783  lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
4784  __syncthreads();
4785  if (doFixed) {
4786  lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
4787  }
4788  lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
4789  lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
4790  lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
4791 
4792  // Every block has a locally reduced blockVirial
4793  // Now thread 0 of each block atomically adds it into the binned global virial
4794  if(threadIdx.x == 0){
4795  const int bin = blockIdx.x % ATOMIC_BINS;
4796 
4797  atomicAdd(&virial[bin].xx, lVirial.xx);
4798  if (doFixed) {
4799  atomicAdd(&virial[bin].xy, lVirial.xy);
4800  atomicAdd(&virial[bin].xz, lVirial.xz);
4801  }
4802  atomicAdd(&virial[bin].yx, lVirial.yx);
4803  atomicAdd(&virial[bin].yy, lVirial.yy);
4804  if (doFixed) {
4805  atomicAdd(&virial[bin].yz, lVirial.yz);
4806  }
4807  atomicAdd(&virial[bin].zx, lVirial.zx);
4808  atomicAdd(&virial[bin].zy, lVirial.zy);
4809  atomicAdd(&virial[bin].zz, lVirial.zz);
4810  }
4811  }
4812 }
4813 
4814 
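// Two-pass construction of the RATTLE work list.  With fillIndexes == true
// the kernel only records in rattleIndexes which hydrogen groups are
// RATTLE candidates: multi-atom groups, skipping groups whose parent atom
// carries a rigid bond length (SETTLE-handled water) unless some atom in
// the group is fixed.  With fillIndexes == false it fills each
// CudaRattleElem with the constrained atom pairs, squared bond lengths,
// and reciprocal masses (zeroed for fixed atoms).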
4815 template <bool fillIndexes>
4816 __global__ void buildRattleParams(const int size,
4817  const float dt,
4818  const float invdt,
4819  const int *hgi,
4820  //Do I need new vectors for holding the rattle forces? Let's do this for now
4821  const float * __restrict mass,
4822  const int * __restrict hydrogenGroupSize,
4823  const float * __restrict rigidBondLength,
4824  const int * __restrict atomFixed,
4825  CudaRattleElem* rattleList,
4826  int *rattleIndexes)
4827 {
4828  int tid = threadIdx.x + blockIdx.x*blockDim.x;
4829  int stride = gridDim.x * blockDim.x;
4830  float rmass[10];
4831  float dsq[10];
4832  int fixed[10];
4833  int ial[10];
4834  int ibl[10];
4835  int ig, hgs, i;
4836  bool anyfixed;
4837 
4838  for(int k = tid; k < size; k += stride){
4839  if(!fillIndexes){
4840  i = rattleIndexes[k];
4841  if(i == -1) break;
4842  }else i = k;
4843  ig = hgi[i];
4844  hgs = hydrogenGroupSize[ig];
4845  if(hgs == 1) continue; // single-atom group: nothing to constrain
4846  int icnt = 0;
4847  anyfixed = false;
4848  for(int j = 0; j < hgs; j++){
4849  rmass[j] = (mass[ig + j] > 0.f ? __frcp_rn(mass[ig+j]) : 0.f);
4850  fixed[j] = atomFixed[ig+j];
4851  if(fixed[j]) {
4852  anyfixed = true;
4853  rmass[j] = 0.f;
4854  }
4855  }
4856  float tmp = rigidBondLength[ig];
4857  // A positive parent bond length marks a water (H-H constraint)
4858  if(tmp > 0.f){
4859  if(!anyfixed) continue; // waters with no fixed atoms are handled by SETTLE
4860  if( !(fixed[1] && fixed[2]) ){
4861  dsq[icnt] = tmp*tmp;
4862  ial[icnt] = 1;
4863  ibl[icnt++] = 2;
4864  }
4865  }
4866  if(fillIndexes){
4867  rattleIndexes[i] = i;
4868  continue;
4869  }else{
4870  for(int j = 1; j < hgs; j++){
4871  if((tmp = rigidBondLength[ig+j]) > 0){
4872  if( !(fixed[0] && fixed[j])){
4873  dsq[icnt] = tmp * tmp;
4874  ial[icnt] = 0;
4875  ibl[icnt++] = j;
4876  }
4877  }
4878  }
4879  // Write the constraint parameters for this hydrogen group into rattleList[k]
4880  // XXX TODO: improve this scattered per-group write pattern later
4881  rattleList[k].ig = ig;
4882  rattleList[k].icnt = icnt;
4883  for(int j = 0; j < icnt; j++){
4884  int ia = ial[j];
4885  int ib = ibl[j];
4886  rattleList[k].params[j].ia = ia;
4887  rattleList[k].params[j].ib = ib;
4888  rattleList[k].params[j].dsq = dsq[j];
4889  rattleList[k].params[j].rma = rmass[ia];
4890  rattleList[k].params[j].rmb = rmass[ib];
4891  }
4892  }
4893  }
4894 }
4895 
4896 void buildRattleLists(
4897  const bool doFixedAtoms,
4898  const int natoms,
4899  const float dt,
4900  const float invdt,
4901  const float *mass,
4902  const int *hydrogenGroupSize,
4903  const float *rigidBondLength,
4904  const int *atomFixed,
4905  int **settleList,
4906  size_t& settleListSize,
4907  int **consFailure,
4908  size_t& consFailureSize,
4909  CudaRattleElem **rattleList,
4910  size_t& rattleListSize,
4911  int &nSettles,
4912  int &nRattles,
4913  int*& d_nHG,
4914  int*& d_nSettles,
4915  int*& d_nRattles,
4916  int*& hgi,
4917  size_t& hgi_size,
4918  char*& d_rattleList_temp_storage,
4919  size_t& temp_storage_bytes,
4920  int*& rattleIndexes,
4921  size_t& rattleIndexes_size,
4922  bool first,
4923  cudaStream_t stream){
4924  size_t nHG = 0;
4925 
4926  //thrust::device_ptr<const int> h_dev = thrust::device_pointer_cast(hydrogenGroupSize)
4927  size_t st_hgi_size = (size_t)(hgi_size);
4928  reallocate_device<int>(&hgi, &st_hgi_size, natoms, OVERALLOC);
4929  hgi_size = (size_t)(st_hgi_size);
4930  size_t temp_length = 1;
4931  reallocate_device<int>(&d_nHG, &temp_length, 1);
4932  reallocate_device<int>(&d_nSettles, &temp_length, 1);
4933  reallocate_device<int>(&d_nRattles, &temp_length, 1);
4934 
4935  copy_DtoD<int>(hydrogenGroupSize, hgi, natoms, stream);
4936  size_t new_storage_bytes = 0;
4937  // Removes zeroes from HGI array
4938  // Pass NULL to first CUB call to get temp buffer size
4939  cudaCheck(cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, hgi,
4940  d_nHG, natoms, notZero(), stream));
4941  reallocate_device<char>(&d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4942  // DC: There were issues where reallocate wanted int types and CUB wanted unsigned ints
4943  new_storage_bytes = temp_storage_bytes;
4944  cudaCheck( cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, hgi,
4945  d_nHG, natoms, notZero(), stream));
4946 
4947  int temp;
4948  copy_DtoH<int>(d_nHG, &temp, 1, stream );
4949  nHG = temp;
4950 
4951  cudaCheck(cudaStreamSynchronize(stream));
4952 
4953  // Exclusive scan to build an array of indices
4954  new_storage_bytes = 0;
4955  cub::DeviceScan::ExclusiveSum(NULL,
4956  new_storage_bytes, hgi, hgi, nHG, stream);
4957  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4958  new_storage_bytes = temp_storage_bytes;
4959  cub::DeviceScan::ExclusiveSum(d_rattleList_temp_storage, new_storage_bytes,
4960  hgi, hgi, nHG, stream);
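  // After compaction and the exclusive scan, hgi[k] holds the starting atom
  // index of the k-th hydrogen group, which buildRattleParams indexes below.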
4961 
4962  size_t st_tempListSize = (size_t)(settleListSize);
4963  reallocate_device<int>(settleList, &st_tempListSize, nHG, OVERALLOC);
4964  settleListSize = (size_t)(st_tempListSize);
4965 
4966  st_tempListSize = (size_t)(rattleListSize);
4967  reallocate_device<CudaRattleElem>(rattleList, &st_tempListSize, nHG, OVERALLOC);
4968  rattleListSize = (size_t)(st_tempListSize);
4969 
4970  st_tempListSize = (size_t)(consFailureSize);
4971  reallocate_device<int>(consFailure, &st_tempListSize, nHG, OVERALLOC);
4972  consFailureSize = (size_t)(st_tempListSize);
4973 
4974  st_tempListSize = (size_t)(rattleIndexes_size);
4975  reallocate_device<int>(&rattleIndexes, &st_tempListSize, nHG, OVERALLOC);
4976  rattleIndexes_size = (size_t)(st_tempListSize);
4977 
4978  // Flagging arrays
4979  cudaCheck(cudaMemsetAsync(*rattleList, 0, sizeof(CudaRattleElem)*nHG, stream));
4980  cudaCheck(cudaMemsetAsync(rattleIndexes,-1, sizeof(int)*nHG, stream));
4981  cudaCheck(cudaMemsetAsync(*settleList, 0, sizeof(int)*nHG, stream));
4982 
4983  ///thrust::device_vector<int> hgi(natoms);
4984  new_storage_bytes = 0;
4985  if (doFixedAtoms) {
4986  isWater<true> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4987  // builds SettleList with isWater functor
4988  cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4989  d_nSettles, nHG, comp, stream);
4990  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4991  new_storage_bytes = temp_storage_bytes;
4992  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
4993  d_nSettles, nHG, comp, stream);
4994  } else {
4995  isWater<false> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4996  // builds SettleList with isWater functor
4997  cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4998  d_nSettles, nHG, comp, stream);
4999  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5000  new_storage_bytes = temp_storage_bytes;
5001  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
5002  d_nSettles, nHG, comp, stream);
5003  }
5004 
5005  // d_nSettles now holds the number of SETTLE water groups; copy it to the host
5006  copy_DtoH<int>(d_nSettles, &nSettles, 1, stream);
5007  // fprintf(stderr, "nSettles = %d", nSettles);
5008 
5009  // First pass (fillIndexes=true): flag hydrogen groups that need RATTLE in rattleIndexes
5010  buildRattleParams<true><<<128, 128, 0, stream>>>(nHG, dt, invdt,
5011  hgi, mass, hydrogenGroupSize,
5012  rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5013  cudaCheck(cudaGetLastError());
5014 
5015  // Removing -1 from rattleIndexes
5016  new_storage_bytes = 0;
5017  cub::DeviceSelect::If(NULL, new_storage_bytes,
5018  rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5019  reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5020  new_storage_bytes = temp_storage_bytes;
5021  cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes,
5022  rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5023  copy_DtoH<int>(d_nRattles, &nRattles, 1, stream);
5024 
5025  // Second pass (fillIndexes=false): compute constraint parameters for the compacted rattle list
5026  buildRattleParams<false><<<128, 128, 0, stream>>>(nRattles, dt, invdt,
5027  hgi, mass, hydrogenGroupSize,
5028  rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5029 
5030  cudaCheck(cudaGetLastError());
5031 }
5032 
5033 // XXX TODO: JM: Memory access pattern in this function is bad, since we have to
5034 // access memory by the hydrogen groups. Improve it later
5035 
5036 void SequencerCUDAKernel::rattle1(
5037  const bool doFixedAtoms,
5038  const bool doEnergy,
5039  const bool pressure, // pressure is only false for startRun1 and startRun2 right now
5040  const int numAtoms,
5041  const double dt,
5042  const double invdt,
5043  const double tol2,
5044  double *vel_x,
5045  double *vel_y,
5046  double *vel_z,
5047  double *pos_x,
5048  double *pos_y,
5049  double *pos_z,
5050  double *velNew_x,
5051  double *velNew_y,
5052  double *velNew_z,
5053  double *posNew_x,
5054  double *posNew_y,
5055  double *posNew_z,
5056  double *f_normal_x,
5057  double *f_normal_y,
5058  double *f_normal_z,
5059  const int *hydrogenGroupSize,
5060  const float *rigidBondLength,
5061  const float *mass,
5062  const int *atomFixed,
5063  int **settleList,
5064  size_t& settleListSize,
5065  int **consFailure_d,
5066  size_t& consFailureSize,
5067  CudaRattleElem **rattleList,
5068  size_t& rattleListSize,
5069  int *nSettle,
5070  int *nRattle,
5071  cudaTensor *virial,
5072  cudaTensor *h_virial,
5073  unsigned int* tbcatomic,
5074  int migration,
5075  SettleParameters *sp,
5076  bool first,
5077  int* consFailure,
5078  const WaterModel water_model,
5079  cudaStream_t stream)
5080  {
5081  //Calls the necessary functions to enforce rigid bonds
5082  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5083  int maxiter = 10;
5084  cudaCheck(cudaStreamSynchronize(stream));
5085  if(migration || !firstRattleDone){
5086  // I need to allocate the water positions here
5087  buildRattleLists(
5088  doFixedAtoms, numAtoms, dt, invdt, mass, hydrogenGroupSize,
5089  rigidBondLength, atomFixed, settleList, settleListSize,
5090  consFailure_d, consFailureSize, rattleList, rattleListSize,
5091  *nSettle, *nRattle,
5092  d_nHG, d_nSettles, d_nRattles, hgi, hgi_size,
5093  d_rattleList_temp_storage, temp_storage_bytes,
5094  rattleIndexes, rattleIndexes_size,
5095  first,stream);
5096  firstRattleDone = true;
5097  }
5098 
5099 #if 1
5100  // Split path: separate SETTLE and MSHAKE kernels (the #else branch uses a fused Rattle1 kernel)
5101  this->updateRigidArrays(
5102  doFixedAtoms, dt, atomFixed,
5103  vel_x, vel_y, vel_z,
5104  pos_x, pos_y, pos_z,
5105  velNew_x, velNew_y, velNew_z,
5106  posNew_x, posNew_y, posNew_z,
5107  numAtoms, stream);
5108 
5109  #if 1
5110 
5111  if(*nSettle != 0){
5112  Settle(doEnergy,
5113  numAtoms, dt, invdt, *nSettle,
5114  vel_x, vel_y, vel_z,
5115  pos_x, pos_y, pos_z,
5116  velNew_x, velNew_y, velNew_z,
5117  posNew_x, posNew_y, posNew_z,
5118  f_normal_x, f_normal_y, f_normal_z, virial, mass,
5119  hydrogenGroupSize, rigidBondLength, atomFixed,
5120  *settleList, sp, water_model, stream);
5121  }
5122 
5123 
5124  if(*nRattle != 0){
5125  MSHAKE_CUDA(
5126  doEnergy, doFixedAtoms, *rattleList, *nRattle, hydrogenGroupSize,
5127  atomFixed, pos_x, pos_y, pos_z,
5128  posNew_x, posNew_y, posNew_z,
5129  vel_x, vel_y, vel_z,
5130  velNew_x, velNew_y,velNew_z,
5131  f_normal_x, f_normal_y, f_normal_z, virial, mass,
5132  invdt, tol2, maxiter, *consFailure_d,
5133  consFailure, stream);
5134  }
5135  #else
5136  Rattle1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5137  dt, invdt, *nSettle,
5138  vel_x, vel_y, vel_z,
5139  pos_x, pos_y, pos_z,
5140  velNew_x, velNew_y, velNew_z,
5141  posNew_x, posNew_y, posNew_z,
5142  f_normal_x, f_normal_y, f_normal_z,
5143  virial, mass,
5144  hydrogenGroupSize, rigidBondLength, atomFixed,
5145  *settleList, sp,
5146  *rattleList, *nRattle, tol2, maxiter, *consFailure_d);
5147  #endif
5148 
5149 
5150  if(invdt == 0){
5151  // only for the first time step
5152  copy_DtoD<double>(posNew_x, pos_x, numAtoms, stream);
5153  copy_DtoD<double>(posNew_y, pos_y, numAtoms, stream);
5154  copy_DtoD<double>(posNew_z, pos_z, numAtoms, stream);
5155  cudaCheck(cudaStreamSynchronize(stream));
5156  }else if (pressure == 0){ // pressure is zero for the first timesteps
5157  copy_DtoD<double>(velNew_x, vel_x, numAtoms, stream);
5158  copy_DtoD<double>(velNew_y, vel_y, numAtoms, stream);
5159  copy_DtoD<double>(velNew_z, vel_z, numAtoms, stream);
5160  cudaCheck(cudaStreamSynchronize(stream));
5161  }else{
5162 #define CALL(DOENERGY, DOFIXED) \
5163  addRattleForce<DOENERGY, DOFIXED><<<grid, ATOM_BLOCKS, 0 ,stream>>>( \
5164  numAtoms, invdt, atomFixed, mass, \
5165  pos_x, pos_y, pos_z, \
5166  vel_x, vel_y, vel_z, \
5167  velNew_x, velNew_y, velNew_z, \
5168  f_normal_x, f_normal_y, f_normal_z, virial, h_virial, tbcatomic)
5169  if (doEnergy && doFixedAtoms) CALL(true, true);
5170  else if (doEnergy && !doFixedAtoms) CALL(true, false);
5171  else if (!doEnergy && doFixedAtoms) CALL(false, true);
5172  else if (!doEnergy && !doFixedAtoms) CALL(false, false);
5173  else NAMD_bug("addRattleForce was not called in SequencerCUDAKernel::rattle1!\n");
5174 #undef CALL
5175  }
5176 
5177 #else
5178  // Calls a fused kernel without any tensor update
5179  // let's calculate the number of settle blocks
5180  int nSettleBlocks = (*nSettle + 128 -1 ) / 128;
5181  int nShakeBlocks = (*nRattle + 128 -1 ) / 128;
5182  // int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
5183  if (doFixedAtoms) NAMD_bug("CallRattle1Kernel does not support fixed atoms!\n");
5184  CallRattle1Kernel(
5185  numAtoms, dt, invdt, *nSettle,
5186  vel_x, vel_y, vel_z,
5187  pos_x, pos_y, pos_z,
5188  f_normal_x, f_normal_y, f_normal_z,
5189  mass, hydrogenGroupSize, rigidBondLength, atomFixed,
5190  *settleList, sp,
5191  *rattleList, *nRattle, tol2, maxiter, *consFailure_d, nSettleBlocks,
5192  nShakeBlocks, water_model, stream
5193  );
5194 #endif
5195 
5196 }
5197 
5198 
5199 template <bool T_NORMALIZED, bool T_DOENERGY>
5200 __global__ void eFieldKernel(
5201  const int numAtoms,
5202  const double3 eField,
5203  const double eFieldOmega,
5204  const double eFieldPhi,
5205  const double t,
5206  const Lattice lat,
5207  const char3* __restrict transform,
5208  const float* __restrict charges,
5209  const double* __restrict pos_x,
5210  const double* __restrict pos_y,
5211  const double* __restrict pos_z,
5212  double* __restrict f_normal_x,
5213  double* __restrict f_normal_y,
5214  double* __restrict f_normal_z,
5215  double3* __restrict d_extForce,
5216  cudaTensor* __restrict d_extVirial,
5217  double* __restrict d_extEnergy,
5218  double3* __restrict h_extForce,
5219  cudaTensor* __restrict h_extVirial,
5220  double* __restrict h_extEnergy,
5221  unsigned int* tbcatomic)
5222 {
5223  int i = threadIdx.x + blockIdx.x*blockDim.x;
5224  double3 eField1;
5225 
5226  // This can all be moved outside of the kernel
5227 
5228  double fx, fy, fz;
5229  double eCos = cos(eFieldOmega * t - eFieldPhi);
5230  double charge = (i < numAtoms) ? (double)charges[i] : 0.0; // guard out-of-range threads in the last block
5231  double3 r_extForce = {0, 0, 0};
5232  double r_extEnergy = 0;
5233  cudaTensor r_extVirial = {0};
5234  int totaltb = gridDim.x;
5235  __shared__ bool isLastBlockDone;
5236  eField1.x = eCos * eField.x;
5237  eField1.y = eCos * eField.y;
5238  eField1.z = eCos * eField.z;
5239 
5240  if(threadIdx.x == 0) {
5241  isLastBlockDone = 0;
5242  }
5243 
5244  __syncthreads();
5245 
5246  if(i < numAtoms){
5247  // This can be moved outside of the kernel
5248  if(T_NORMALIZED){
5249  eField1 = Vector(lat.a_r()* (Vector)eField1, lat.b_r()* (Vector)eField1, lat.c_r()* (Vector)eField1);
5250  }
5251 
5252  fx = charge * eField1.x;
5253  fy = charge * eField1.y;
5254  fz = charge * eField1.z;
5255 
5256  f_normal_x[i] += fx;
5257  f_normal_y[i] += fy;
5258  f_normal_z[i] += fz;
5259 
5260  if(T_DOENERGY){
5261  const char3 tr = transform[i];
5262  double3 vpos;
5263  vpos.x = pos_x[i];
5264  vpos.y = pos_y[i];
5265  vpos.z = pos_z[i];
5266 
5267  //all threads from block calculate their contribution to energy, extForce and extVirial
5268  vpos = lat.reverse_transform(vpos, tr);
5269  double3 o = lat.origin();
5270  r_extEnergy -= (fx * (vpos.x - o.x)) + (fy * (vpos.y - o.y)) + (fz * (vpos.z - o.z));
5271 
5272  if(!T_NORMALIZED){
5273  r_extForce.x = fx;
5274  r_extForce.y = fy;
5275  r_extForce.z = fz;
5276 
5277  // outer product to calculate a virial here
5278  r_extVirial.xx = fx * vpos.x;
5279  r_extVirial.xy = fx * vpos.y;
5280  r_extVirial.xz = fx * vpos.z;
5281  r_extVirial.yx = fy * vpos.x;
5282  r_extVirial.yy = fy * vpos.y;
5283  r_extVirial.yz = fy * vpos.z;
5284  r_extVirial.zx = fz * vpos.x;
5285  r_extVirial.zz = fz * vpos.z;
5286  r_extVirial.zy = fz * vpos.y;
5287  }
5288  }
5289  }
5290 
5291  __syncthreads();
5292  if(T_DOENERGY){
5293  // use cub to reduce energies
5294  typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
5295  __shared__ typename BlockReduce::TempStorage temp_storage;
5296  r_extEnergy = BlockReduce(temp_storage).Sum(r_extEnergy);
5297  __syncthreads();
5298  if(!T_NORMALIZED){
5299  // Do the remaining reductions
5300  // external force
5301  r_extForce.x = BlockReduce(temp_storage).Sum(r_extForce.x);
5302  __syncthreads();
5303  r_extForce.y = BlockReduce(temp_storage).Sum(r_extForce.y);
5304  __syncthreads();
5305  r_extForce.z = BlockReduce(temp_storage).Sum(r_extForce.z);
5306  __syncthreads();
5307  // external virial
5308  r_extVirial.xx = BlockReduce(temp_storage).Sum(r_extVirial.xx);
5309  __syncthreads();
5310  r_extVirial.xy = BlockReduce(temp_storage).Sum(r_extVirial.xy);
5311  __syncthreads();
5312  r_extVirial.xz = BlockReduce(temp_storage).Sum(r_extVirial.xz);
5313  __syncthreads();
5314  r_extVirial.yx = BlockReduce(temp_storage).Sum(r_extVirial.yx);
5315  __syncthreads();
5316  r_extVirial.yy = BlockReduce(temp_storage).Sum(r_extVirial.yy);
5317  __syncthreads();
5318  r_extVirial.yz = BlockReduce(temp_storage).Sum(r_extVirial.yz);
5319  __syncthreads();
5320  r_extVirial.zx = BlockReduce(temp_storage).Sum(r_extVirial.zx);
5321  __syncthreads();
5322  r_extVirial.zy = BlockReduce(temp_storage).Sum(r_extVirial.zy);
5323  __syncthreads();
5324  r_extVirial.zz = BlockReduce(temp_storage).Sum(r_extVirial.zz);
5325  __syncthreads();
5326  }
5327 
5328  // Thread 0 of each block accumulates the block results into global memory
5329  if(threadIdx.x == 0 ){
5330  const int bin = blockIdx.x % ATOMIC_BINS;
5331 
5332  // printf("adding %lf to d_extEnergy\n", r_extEnergy);
5333  atomicAdd(&d_extEnergy[bin], r_extEnergy);
5334 
5335  if(!T_NORMALIZED){
5336  // add force and virial as well
5337  atomicAdd(&d_extForce[bin].x, r_extForce.x);
5338  atomicAdd(&d_extForce[bin].y, r_extForce.y);
5339  atomicAdd(&d_extForce[bin].z, r_extForce.z);
5340 
5341  atomicAdd(&d_extVirial[bin].xx, r_extVirial.xx);
5342  atomicAdd(&d_extVirial[bin].xy, r_extVirial.xy);
5343  atomicAdd(&d_extVirial[bin].xz, r_extVirial.xz);
5344 
5345  atomicAdd(&d_extVirial[bin].yx, r_extVirial.yx);
5346  atomicAdd(&d_extVirial[bin].yy, r_extVirial.yy);
5347  atomicAdd(&d_extVirial[bin].yz, r_extVirial.yz);
5348 
5349  atomicAdd(&d_extVirial[bin].zx, r_extVirial.zx);
5350  atomicAdd(&d_extVirial[bin].zy, r_extVirial.zy);
5351  atomicAdd(&d_extVirial[bin].zz, r_extVirial.zz);
5352  }
5353 
5354  __threadfence();
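  // The block whose thread 0 observes value == totaltb-1 is the last block
  // to arrive; it performs the final reduction into the host buffers below.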
5355  unsigned int value = atomicInc(&tbcatomic[2], totaltb);
5356  isLastBlockDone = (value == (totaltb -1));
5357 
5358  }
5359  __syncthreads();
5360  if(isLastBlockDone){
5361  if(threadIdx.x < ATOMIC_BINS){
5362  const int bin = threadIdx.x;
5363 
5364  double e = d_extEnergy[bin];
5365  double3 f;
5366  cudaTensor v;
5367  if(!T_NORMALIZED){
5368  f = d_extForce[bin];
5369  v = d_extVirial[bin];
5370  }
5371 
5372  // sets device scalars back to zero
5373  d_extEnergy[bin] = 0.0;
5374  if(!T_NORMALIZED){
5375  d_extForce[bin] = make_double3(0.0, 0.0, 0.0);
5376  zero_cudaTensor(d_extVirial[bin]);
5377  }
5378 
5379  if(ATOMIC_BINS > 1){
5380  typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
5381  typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
5382  __shared__ typename WarpReduce::TempStorage tempStorage;
5383  __shared__ typename WarpReduceT::TempStorage tempStorageT;
5384 
5385  e = WarpReduce(tempStorage).Sum(e);
5386  if(!T_NORMALIZED){
5387  f.x = WarpReduce(tempStorage).Sum(f.x);
5388  f.y = WarpReduce(tempStorage).Sum(f.y);
5389  f.z = WarpReduce(tempStorage).Sum(f.z);
5390  v = WarpReduceT(tempStorageT).Sum(v);
5391  }
5392  }
5393 
5394  if(threadIdx.x == 0){
5395  h_extEnergy[0] = e;
5396  if(!T_NORMALIZED){
5397  h_extForce[0] = f;
5398  h_extVirial[0] = v;
5399  }
5400 
5401  //resets atomic counter
5402  reset_atomic_counter(&(tbcatomic[2]));
5403  }
5404  }
5405  }
5406  }
5407 }
5408 
5409 // JM NOTE: Apply external electric field to every atom in the system equally
5410 void SequencerCUDAKernel::apply_Efield(
5411  const int numAtoms,
5412  const bool normalized,
5413  const bool doEnergy,
5414  const double3 eField,
5415  const double eFieldOmega,
5416  const double eFieldPhi,
5417  const double t,
5418  const Lattice lat,
5419  const char3* transform,
5420  const float* charges,
5421  const double* pos_x,
5422  const double* pos_y,
5423  const double* pos_z,
5424  double* f_normal_x,
5425  double* f_normal_y,
5426  double* f_normal_z,
5427  double3* d_extForce,
5428  cudaTensor* d_extVirial,
5429  double* d_extEnergy,
5430  double3* h_extForce,
5431  cudaTensor* h_extVirial,
5432  double* h_extEnergy,
5433  unsigned int* tbcatomic,
5434  cudaStream_t stream
5435 ){
5436  int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5437  if(normalized){
5438  if(doEnergy) {
5439  eFieldKernel<true, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5440  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5441  pos_x, pos_y, pos_z,
5442  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5443  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5444  }else{
5445  eFieldKernel<true, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5446  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5447  pos_x, pos_y, pos_z,
5448  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5449  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5450  }
5451  }else{
5452  if(doEnergy) {
5453  eFieldKernel<false, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5454  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5455  pos_x, pos_y, pos_z,
5456  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5457  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5458  }else{
5459  eFieldKernel<false, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5460  eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5461  pos_x, pos_y, pos_z,
5462  f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5463  d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5464  }
5465  }
5466 }
5467 
5468 SequencerCUDAKernel::SequencerCUDAKernel() {
5469  firstRattleDone = false;
5470  intConstInit = false;
5471 
5472  d_nHG = NULL;
5473  d_nSettles = NULL;
5474  d_nRattles = NULL;
5475 
5476  hgi = NULL;
5477  hgi_size = 0;
5478 
5479  d_rattleList_temp_storage = NULL;
5480  temp_storage_bytes = 0;
5481 
5482  rattleIndexes = NULL;
5483  rattleIndexes_size = 0;
5484 }
5485 
5486 SequencerCUDAKernel::~SequencerCUDAKernel() {
5487  deallocate_device<int>(&d_nHG);
5488  deallocate_device<int>(&d_nSettles);
5489  deallocate_device<int>(&d_nRattles);
5490  deallocate_device<int>(&hgi);
5491  deallocate_device<int>(&rattleIndexes);
5492  deallocate_device<char>(&d_rattleList_temp_storage);
5493 }
5494 
5495 
5496 
5497 #if 1
5498 
5499 
5500 
5501 
5502 void SequencerCUDAKernel::set_pme_positions(
5503  const int devID,
5504  const bool isPmeDevice,
5505  const int nDev,
5506  const int numPatchesHomeAndProxy,
5507  const int numPatchesHome,
5508  const bool doNbond,
5509  const bool doSlow,
5510  const bool doFEP,
5511  const bool doTI,
5512  const bool doAlchDecouple,
5513  const bool doAlchSoftCore,
5514  const bool handleBoundary,
5515  const double* d_pos_x,
5516  const double* d_pos_y,
5517  const double* d_pos_z,
5518 #ifndef NAMD_NCCL_ALLREDUCE
5519  double** d_peer_pos_x,
5520  double** d_peer_pos_y,
5521  double** d_peer_pos_z,
5522  float** d_peer_charge,
5523  int** d_peer_partition,
5524 #endif
5525  const float* charges,
5526  const int* partition,
5527  const double charge_scaling,
5528  const double3* patchCenter,
5529  const int* s_patchPositions,
5530  const int* s_pencilPatchIndex,
5531  const int* s_patchIDs,
5532  const int* patchSortOrder,
5533  const Lattice lattice,
5534  float4* nb_atoms,
5535  float4* b_atoms,
5536  float4* s_atoms,
5537  int* s_partition,
5538  int numTotalAtoms,
5539  CudaLocalRecord* localRecords,
5540  CudaPeerRecord* peerRecords,
5541  std::vector<int>& atomCounts,
5542  cudaStream_t stream
5543 ) {
5544  // Launch PME setComputes
5545  #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
5546  setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
5547  <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
5548  d_pos_x, d_pos_y, d_pos_z, charges, \
5549  d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
5550  d_peer_partition, partition, charge_scaling, \
5551  s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
5552  lattice, s_atoms, numTotalAtoms, s_partition, \
5553  i, numAtoms, offset, handleBoundary \
5554  );
5555  // Only when PME long-range electrostatics is enabled (doSlow is true).
5556  // The partition is needed for alchemical calculations.
5557  if(doSlow && isPmeDevice) {
5558  int offset = 0;
5559  for (int i = 0; i < nDev; i++) {
5560  const bool home = (i == devID);
5561  const int numAtoms = atomCounts[i];
5562  const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
5563  const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
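  // The five boolean flags are packed into a 5-bit index so a single switch
  // can dispatch to the matching template instantiation below.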
5564 
5565  switch (options) {
5566  case 0: CALL(0, 0, 0, 0, 0); break;
5567  case 1: CALL(0, 0, 0, 0, 1); break;
5568  case 2: CALL(0, 0, 0, 1, 0); break;
5569  case 3: CALL(0, 0, 0, 1, 1); break;
5570  case 4: CALL(0, 0, 1, 0, 0); break;
5571  case 5: CALL(0, 0, 1, 0, 1); break;
5572  case 6: CALL(0, 0, 1, 1, 0); break;
5573  case 7: CALL(0, 0, 1, 1, 1); break;
5574  case 8: CALL(0, 1, 0, 0, 0); break;
5575  case 9: CALL(0, 1, 0, 0, 1); break;
5576  case 10: CALL(0, 1, 0, 1, 0); break;
5577  case 11: CALL(0, 1, 0, 1, 1); break;
5578  case 12: CALL(0, 1, 1, 0, 0); break;
5579  case 13: CALL(0, 1, 1, 0, 1); break;
5580  case 14: CALL(0, 1, 1, 1, 0); break;
5581  case 15: CALL(0, 1, 1, 1, 1); break;
5582  case 16: CALL(1, 0, 0, 0, 0); break;
5583  case 17: CALL(1, 0, 0, 0, 1); break;
5584  case 18: CALL(1, 0, 0, 1, 0); break;
5585  case 19: CALL(1, 0, 0, 1, 1); break;
5586  case 20: CALL(1, 0, 1, 0, 0); break;
5587  case 21: CALL(1, 0, 1, 0, 1); break;
5588  case 22: CALL(1, 0, 1, 1, 0); break;
5589  case 23: CALL(1, 0, 1, 1, 1); break;
5590  case 24: CALL(1, 1, 0, 0, 0); break;
5591  case 25: CALL(1, 1, 0, 0, 1); break;
5592  case 26: CALL(1, 1, 0, 1, 0); break;
5593  case 27: CALL(1, 1, 0, 1, 1); break;
5594  case 28: CALL(1, 1, 1, 0, 0); break;
5595  case 29: CALL(1, 1, 1, 0, 1); break;
5596  case 30: CALL(1, 1, 1, 1, 0); break;
5597  case 31: CALL(1, 1, 1, 1, 1); break;
5598  default:
5599  NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
5600  }
5601 
5602  offset += numAtoms;
5603  }
5604  }
5605  #undef CALL
5606 }
5607 
5608 template <bool doGlobal, bool doForcesOutput>
5609 __global__ void copyForcesToHostSOAKernel(
5610  const int numPatches,
5611  CudaLocalRecord* localRecords,
5612  const int maxForceNumber,
5613  const double* d_f_normal_x,
5614  const double* d_f_normal_y,
5615  const double* d_f_normal_z,
5616  const double* d_f_nbond_x,
5617  const double* d_f_nbond_y,
5618  const double* d_f_nbond_z,
5619  const double* d_f_slow_x,
5620  const double* d_f_slow_y,
5621  const double* d_f_slow_z,
5622  PatchDataSOA* d_HostPatchDataSOA
5623 ) {
5624  __shared__ CudaLocalRecord s_record;
5625  using AccessType = int32_t;
5626  AccessType* s_record_buffer = (AccessType*) &s_record;
5627 
5628  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5629  // Read in the CudaLocalRecord using multiple threads for coalesced access
5630  #pragma unroll 1
5631  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5632  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5633  }
5634  __syncthreads();
5635 
5636  const int numAtoms = s_record.numAtoms;
5637  const int offset = s_record.bufferOffset;
5638 
5639  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5640  if (maxForceNumber >= 0) {
5641  d_HostPatchDataSOA[patchIndex].f_normal_x[i] = d_f_normal_x[offset + i];
5642  d_HostPatchDataSOA[patchIndex].f_normal_y[i] = d_f_normal_y[offset + i];
5643  d_HostPatchDataSOA[patchIndex].f_normal_z[i] = d_f_normal_z[offset + i];
5644  }
5645  if (maxForceNumber >= 1) {
5646  if (doGlobal) {
5647  d_HostPatchDataSOA[patchIndex].f_saved_nbond_x[i] = d_f_nbond_x[offset + i];
5648  d_HostPatchDataSOA[patchIndex].f_saved_nbond_y[i] = d_f_nbond_y[offset + i];
5649  d_HostPatchDataSOA[patchIndex].f_saved_nbond_z[i] = d_f_nbond_z[offset + i];
5650  }
5651  if (doForcesOutput) {
5652  d_HostPatchDataSOA[patchIndex].f_nbond_x[i] = d_f_nbond_x[offset + i];
5653  d_HostPatchDataSOA[patchIndex].f_nbond_y[i] = d_f_nbond_y[offset + i];
5654  d_HostPatchDataSOA[patchIndex].f_nbond_z[i] = d_f_nbond_z[offset + i];
5655  }
5656  }
5657  if (maxForceNumber >= 2) {
5658  if (doGlobal) {
5659  d_HostPatchDataSOA[patchIndex].f_saved_slow_x[i] = d_f_slow_x[offset + i];
5660  d_HostPatchDataSOA[patchIndex].f_saved_slow_y[i] = d_f_slow_y[offset + i];
5661  d_HostPatchDataSOA[patchIndex].f_saved_slow_z[i] = d_f_slow_z[offset + i];
5662  }
5663  if (doForcesOutput) {
5664  d_HostPatchDataSOA[patchIndex].f_slow_x[i] = d_f_slow_x[offset + i];
5665  d_HostPatchDataSOA[patchIndex].f_slow_y[i] = d_f_slow_y[offset + i];
5666  d_HostPatchDataSOA[patchIndex].f_slow_z[i] = d_f_slow_z[offset + i];
5667  }
5668  }
5669 
5670  }
5671  __syncthreads();
5672  }
5673 }
5674 
5675 void SequencerCUDAKernel::copyForcesToHostSOA(
5676  const int numPatches,
5677  CudaLocalRecord* localRecords,
5678  const int maxForceNumber,
5679  const double* d_f_normal_x,
5680  const double* d_f_normal_y,
5681  const double* d_f_normal_z,
5682  const double* d_f_nbond_x,
5683  const double* d_f_nbond_y,
5684  const double* d_f_nbond_z,
5685  const double* d_f_slow_x,
5686  const double* d_f_slow_y,
5687  const double* d_f_slow_z,
5688  PatchDataSOA* d_HostPatchDataSOA,
5689  const bool doGlobal,
5690  const bool doForcesOutput,
5691  cudaStream_t stream
5692 ) {
5693 #define CALL(GLOBAL, FORCESOUTPUT) \
5694  do {copyForcesToHostSOAKernel<GLOBAL, FORCESOUTPUT><<<numPatches, PATCH_BLOCKS, 0, stream>>>( \
5695  numPatches, \
5696  localRecords, \
5697  maxForceNumber, \
5698  d_f_normal_x, \
5699  d_f_normal_y, \
5700  d_f_normal_z, \
5701  d_f_nbond_x, \
5702  d_f_nbond_y, \
5703  d_f_nbond_z, \
5704  d_f_slow_x, \
5705  d_f_slow_y, \
5706  d_f_slow_z, \
5707  d_HostPatchDataSOA \
5708  );} while (0)
5709  if (doGlobal && !doForcesOutput) CALL(true, false);
5710  else if (doGlobal && doForcesOutput) CALL(true, true);
5711  else if (!doGlobal && doForcesOutput) CALL(false, true);
5712  else NAMD_bug("No kernel is called in copyForcesToHostSOA");
5713 #undef CALL
5714 }
5715 
5716 __global__ void copyPositionsToHostSOAKernel(
5717  const int numPatches,
5718  CudaLocalRecord* localRecords,
5719  const double* pos_x,
5720  const double* pos_y,
5721  const double* pos_z,
5722  PatchDataSOA* d_HostPatchDataSOA
5723 ) {
5724  __shared__ CudaLocalRecord s_record;
5725  using AccessType = int32_t;
5726  AccessType* s_record_buffer = (AccessType*) &s_record;
5727 
5728  for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5729  // Read in the CudaLocalRecord using multiple threads for coalesced access
5730  #pragma unroll 1
5731  for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5732  s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5733  }
5734  __syncthreads();
5735 
5736  const int numAtoms = s_record.numAtoms;
5737  const int offset = s_record.bufferOffset;
5738 
5739  for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5740  d_HostPatchDataSOA[patchIndex].pos_x[i] = pos_x[offset + i];
5741  d_HostPatchDataSOA[patchIndex].pos_y[i] = pos_y[offset + i];
5742  d_HostPatchDataSOA[patchIndex].pos_z[i] = pos_z[offset + i];
5743  }
5744  __syncthreads();
5745  }
5746 }
5747 
5748 void SequencerCUDAKernel::copyPositionsToHostSOA(
5749  const int numPatches,
5750  CudaLocalRecord* localRecords,
5751  const double* pos_x,
5752  const double* pos_y,
5753  const double* pos_z,
5754  PatchDataSOA* d_HostPatchDataSOA,
5755  cudaStream_t stream
5756 ) {
5757  copyPositionsToHostSOAKernel<<<numPatches, PATCH_BLOCKS, 0, stream>>>(
5758  numPatches,
5759  localRecords,
5760  pos_x, pos_y, pos_z,
5761  d_HostPatchDataSOA
5762  );
5763 }
5764 
5765 // Adapted from HomePatch::redistrib_lp_water_force
5766 /* Redistribute forces from the massless lonepair charge particle onto
5767  * the other atoms of the water.
5768  *
5769  * This uses the same algorithm as CHARMM uses for TIP4P lonepairs.
5770  *
5771  * Pass by reference the forces (O H1 H2 LP) to be modified,
5772  * pass by constant reference the corresponding positions.
5773  */
5774 template <bool doVirial>
5775 __device__ void redistrib_lp_water_force(
5776  double f_ox[3], double f_h1[3], double f_h2[3], double f_lp[3],
5777  const double p_ox[3], const double p_h1[3], const double p_h2[3],
5778  const double p_lp[3], cudaTensor& virial) {
5779  // Accumulate force adjustments
5780  double fad_ox[3] = {};
5781  double fad_h[3] = {};
5782  // Calculate the radial component of the force and add it to the oxygen
5783  const double r_ox_lp[3] = {p_lp[0] - p_ox[0],
5784  p_lp[1] - p_ox[1],
5785  p_lp[2] - p_ox[2]};
5786  double invlen2_r_ox_lp = rnorm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
5787  invlen2_r_ox_lp *= invlen2_r_ox_lp;
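  // rnorm3d returns 1/|r_ox_lp|; squaring it gives 1/|r_ox_lp|^2 for the
  // projection of the lonepair force onto the O-LP direction below.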
5788  const double rad_factor =
5789  (f_lp[0] * r_ox_lp[0] + f_lp[1] * r_ox_lp[1] + f_lp[2] * r_ox_lp[2]) * invlen2_r_ox_lp;
5790  const double f_rad[3] = {r_ox_lp[0] * rad_factor,
5791  r_ox_lp[1] * rad_factor,
5792  r_ox_lp[2] * rad_factor};
5793  fad_ox[0] += f_rad[0];
5794  fad_ox[1] += f_rad[1];
5795  fad_ox[2] += f_rad[2];
5796  // Calculate the angular component
5797  const double r_hcom_ox[3] = {
5798  p_ox[0] - ( (p_h1[0] + p_h2[0]) * 0.5 ),
5799  p_ox[1] - ( (p_h1[1] + p_h2[1]) * 0.5 ),
5800  p_ox[2] - ( (p_h1[2] + p_h2[2]) * 0.5 )};
5801  const double f_ang[3] = {
5802  f_lp[0] - f_rad[0],
5803  f_lp[1] - f_rad[1],
5804  f_lp[2] - f_rad[2]};
5805  // now split this component onto the other atoms
5806  const double len_r_ox_lp = norm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
5807  const double invlen_r_hcom_ox = rnorm3d(
5808  r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]);
5809  const double oxcomp =
5810  (norm3d(r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]) - len_r_ox_lp) *
5811  invlen_r_hcom_ox;
5812  const double hydcomp = 0.5 * len_r_ox_lp * invlen_r_hcom_ox;
5813  fad_ox[0] += (f_ang[0] * oxcomp);
5814  fad_ox[1] += (f_ang[1] * oxcomp);
5815  fad_ox[2] += (f_ang[2] * oxcomp);
5816  fad_h[0] += (f_ang[0] * hydcomp);
5817  fad_h[1] += (f_ang[1] * hydcomp);
5818  fad_h[2] += (f_ang[2] * hydcomp);
5819  if (doVirial) {
5820  virial.xx = fad_ox[0] * p_ox[0] + fad_h[0] * p_h1[0] + fad_h[0] * p_h2[0] - f_lp[0] * p_lp[0];
5821  virial.xy = fad_ox[0] * p_ox[1] + fad_h[0] * p_h1[1] + fad_h[0] * p_h2[1] - f_lp[0] * p_lp[1];
5822  virial.xz = fad_ox[0] * p_ox[2] + fad_h[0] * p_h1[2] + fad_h[0] * p_h2[2] - f_lp[0] * p_lp[2];
5823  virial.yx = fad_ox[1] * p_ox[0] + fad_h[1] * p_h1[0] + fad_h[1] * p_h2[0] - f_lp[1] * p_lp[0];
5824  virial.yy = fad_ox[1] * p_ox[1] + fad_h[1] * p_h1[1] + fad_h[1] * p_h2[1] - f_lp[1] * p_lp[1];
5825  virial.yz = fad_ox[1] * p_ox[2] + fad_h[1] * p_h1[2] + fad_h[1] * p_h2[2] - f_lp[1] * p_lp[2];
5826  virial.zx = fad_ox[2] * p_ox[0] + fad_h[2] * p_h1[0] + fad_h[2] * p_h2[0] - f_lp[2] * p_lp[0];
5827  virial.zy = fad_ox[2] * p_ox[1] + fad_h[2] * p_h1[1] + fad_h[2] * p_h2[1] - f_lp[2] * p_lp[1];
5828  virial.zz = fad_ox[2] * p_ox[2] + fad_h[2] * p_h1[2] + fad_h[2] * p_h2[2] - f_lp[2] * p_lp[2];
5829  }
5830  f_lp[0] = 0;
5831  f_lp[1] = 0;
5832  f_lp[2] = 0;
5833  f_ox[0] += fad_ox[0];
5834  f_ox[1] += fad_ox[1];
5835  f_ox[2] += fad_ox[2];
5836  f_h1[0] += fad_h[0];
5837  f_h1[1] += fad_h[1];
5838  f_h1[2] += fad_h[2];
5839  f_h2[0] += fad_h[0];
5840  f_h2[1] += fad_h[1];
5841  f_h2[2] += fad_h[2];
5842 }
5843 
5844 template <bool doVirial>
5845 __global__ void redistributeTip4pForcesKernel2(
5846  double* d_f_x,
5847  double* d_f_y,
5848  double* d_f_z,
5849  cudaTensor* d_virial,
5850  const double* d_pos_x,
5851  const double* d_pos_y,
5852  const double* d_pos_z,
5853  const float* d_mass,
5854  const int numAtoms)
5855 {
5856  const int i = threadIdx.x + blockIdx.x * blockDim.x;
5857  cudaTensor lVirial = {0};
5858  if (i < numAtoms) {
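  // A nearly massless particle is treated as the TIP4P lonepair; the code
  // assumes its water is stored contiguously as O, H1, H2, LP at i-3..i.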
5859  if (d_mass[i] < 0.01f) {
5860  double f_ox[3] = {d_f_x[i-3], d_f_y[i-3], d_f_z[i-3]};
5861  double f_h1[3] = {d_f_x[i-2], d_f_y[i-2], d_f_z[i-2]};
5862  double f_h2[3] = {d_f_x[i-1], d_f_y[i-1], d_f_z[i-1]};
5863  double f_lp[3] = {d_f_x[i], d_f_y[i], d_f_z[i]};
5864  const double p_ox[3] = {d_pos_x[i-3], d_pos_y[i-3], d_pos_z[i-3]};
5865  const double p_h1[3] = {d_pos_x[i-2], d_pos_y[i-2], d_pos_z[i-2]};
5866  const double p_h2[3] = {d_pos_x[i-1], d_pos_y[i-1], d_pos_z[i-1]};
5867  const double p_lp[3] = {d_pos_x[i], d_pos_y[i], d_pos_z[i]};
5868  redistrib_lp_water_force<doVirial>(
5869  f_ox, f_h1, f_h2, f_lp, p_ox, p_h1, p_h2, p_lp, lVirial);
5870  // copy the force back
5871  d_f_x[i-3] = f_ox[0];
5872  d_f_x[i-2] = f_h1[0];
5873  d_f_x[i-1] = f_h2[0];
5874  d_f_x[i] = f_lp[0];
5875  d_f_y[i-3] = f_ox[1];
5876  d_f_y[i-2] = f_h1[1];
5877  d_f_y[i-1] = f_h2[1];
5878  d_f_y[i] = f_lp[1];
5879  d_f_z[i-3] = f_ox[2];
5880  d_f_z[i-2] = f_h1[2];
5881  d_f_z[i-1] = f_h2[2];
5882  d_f_z[i] = f_lp[2];
5883  }
5884  }
5885  typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
5886  __shared__ typename BlockReduce::TempStorage temp_storage;
5887  if (doVirial) {
5888  lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx);
5889  __syncthreads();
5890  lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy);
5891  __syncthreads();
5892  lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz);
5893  __syncthreads();
5894  lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx);
5895  __syncthreads();
5896  lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
5897  __syncthreads();
5898  lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz);
5899  __syncthreads();
5900  lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx);
5901  __syncthreads();
5902  lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy);
5903  __syncthreads();
5904  lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz);
5905  __syncthreads();
5906  if (threadIdx.x == 0) {
5907  atomicAdd(&(d_virial->xx), lVirial.xx);
5908  atomicAdd(&(d_virial->xy), lVirial.xy);
5909  atomicAdd(&(d_virial->xz), lVirial.xz);
5910  atomicAdd(&(d_virial->yx), lVirial.yx);
5911  atomicAdd(&(d_virial->yy), lVirial.yy);
5912  atomicAdd(&(d_virial->yz), lVirial.yz);
5913  atomicAdd(&(d_virial->zx), lVirial.zx);
5914  atomicAdd(&(d_virial->zy), lVirial.zy);
5915  atomicAdd(&(d_virial->zz), lVirial.zz);
5916  }
5917  }
5918 }
5919 
5920 void SequencerCUDAKernel::redistributeTip4pForces(
5921  double* d_f_normal_x,
5922  double* d_f_normal_y,
5923  double* d_f_normal_z,
5924  double* d_f_nbond_x,
5925  double* d_f_nbond_y,
5926  double* d_f_nbond_z,
5927  double* d_f_slow_x,
5928  double* d_f_slow_y,
5929  double* d_f_slow_z,
5930  cudaTensor* d_virial_normal,
5931  cudaTensor* d_virial_nbond,
5932  cudaTensor* d_virial_slow,
5933  const double* d_pos_x,
5934  const double* d_pos_y,
5935  const double* d_pos_z,
5936  const float* d_mass,
5937  const int numAtoms,
5938  const int doVirial,
5939  const int maxForceNumber,
5940  cudaStream_t stream
5941 ) {
5942  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5943  switch (maxForceNumber) {
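  // Cases intentionally fall through so every force level <= maxForceNumber
  // gets its lonepair forces redistributed (slow, then nbond, then normal).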
5944  case 2:
5945  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5946  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5947  case 1:
5948  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5949  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5950  case 0:
5951  if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5952  else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
5953  }
5954 }
5955 
5956 __forceinline__ __device__ void calcVirial(
5957  const int i, const double factor,
5958  const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
5959  const double* __restrict r_x, const double* __restrict r_y, const double* __restrict r_z,
5960  cudaTensor& tensor_out) {
5961  tensor_out.xx = factor * f_x[i] * r_x[i];
5962  tensor_out.xy = factor * f_x[i] * r_y[i];
5963  tensor_out.xz = factor * f_x[i] * r_z[i];
5964  tensor_out.yx = factor * f_y[i] * r_x[i];
5965  tensor_out.yy = factor * f_y[i] * r_y[i];
5966  tensor_out.yz = factor * f_y[i] * r_z[i];
5967  tensor_out.zx = factor * f_z[i] * r_x[i];
5968  tensor_out.zy = factor * f_z[i] * r_y[i];
5969  tensor_out.zz = factor * f_z[i] * r_z[i];
5970 }
5971 
5972 template <bool doNbond, bool doSlow>
5973 __global__ void calcFixVirialKernel(
5974  const int numAtoms,
5975  const int* __restrict d_atomFixed,
5976  const double* __restrict d_fixedPosition_x,
5977  const double* __restrict d_fixedPosition_y,
5978  const double* __restrict d_fixedPosition_z,
5979  const double* __restrict d_f_normal_x,
5980  const double* __restrict d_f_normal_y,
5981  const double* __restrict d_f_normal_z,
5982  const double* __restrict d_f_nbond_x,
5983  const double* __restrict d_f_nbond_y,
5984  const double* __restrict d_f_nbond_z,
5985  const double* __restrict d_f_slow_x,
5986  const double* __restrict d_f_slow_y,
5987  const double* __restrict d_f_slow_z,
5988  cudaTensor* __restrict d_virial_normal,
5989  cudaTensor* __restrict d_virial_nbond,
5990  cudaTensor* __restrict d_virial_slow,
5991  double3* __restrict d_extForce_normal,
5992  double3* __restrict d_extForce_nbond,
5993  double3* __restrict d_extForce_slow)
5994 {
5995  const int i = threadIdx.x + blockIdx.x * blockDim.x;
5996  cudaTensor vir_normal = {0};
5997  cudaTensor vir_nbond = {0};
5998  cudaTensor vir_slow = {0};
5999  double3 f_sum_normal = {0, 0, 0};
6000  double3 f_sum_nbond;
6001  if (doNbond) {
6002  f_sum_nbond = {0, 0, 0};
6003  }
6004  double3 f_sum_slow;
6005  if (doSlow) {
6006  f_sum_slow = {0, 0, 0};
6007  }
6008  if (i < numAtoms) {
6009  if (d_atomFixed[i]) {
6010  calcVirial(i, -1.0, d_f_normal_x, d_f_normal_y, d_f_normal_z,
6011  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_normal);
6012  f_sum_normal.x = -1.0 * d_f_normal_x[i];
6013  f_sum_normal.y = -1.0 * d_f_normal_y[i];
6014  f_sum_normal.z = -1.0 * d_f_normal_z[i];
6015  if (doNbond) {
6016  calcVirial(i, -1.0, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
6017  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_nbond);
6018  f_sum_nbond.x = -1.0 * d_f_nbond_x[i];
6019  f_sum_nbond.y = -1.0 * d_f_nbond_y[i];
6020  f_sum_nbond.z = -1.0 * d_f_nbond_z[i];
6021  }
6022  if (doSlow) {
6023  calcVirial(i, -1.0, d_f_slow_x, d_f_slow_y, d_f_slow_z,
6024  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_slow);
6025  f_sum_slow.x = -1.0 * d_f_slow_x[i];
6026  f_sum_slow.y = -1.0 * d_f_slow_y[i];
6027  f_sum_slow.z = -1.0 * d_f_slow_z[i];
6028  }
6029  }
6030  }
6031  // Haochuan: is it OK to use lambda to reduce the cudaTensor as a whole instead of component-wise??
6032  typedef cub::BlockReduce<cudaTensor, ATOM_BLOCKS> BlockReduceCudaTensor;
6033  __shared__ typename BlockReduceCudaTensor::TempStorage temp_storage;
6034  vir_normal = BlockReduceCudaTensor(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6035  __syncthreads();
6036  if (doNbond) {
6037  vir_nbond = BlockReduceCudaTensor(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6038  __syncthreads();
6039  }
6040  if (doSlow) {
6041  vir_slow = BlockReduceCudaTensor(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
6042  __syncthreads();
6043  }
6044  typedef cub::BlockReduce<double3, ATOM_BLOCKS> BlockReduceDouble3;
6045  __shared__ typename BlockReduceDouble3::TempStorage temp_storage2;
6046  f_sum_normal = BlockReduceDouble3(temp_storage2).Reduce(f_sum_normal, [](const double3& a, const double3& b){return a+b;});
6047  __syncthreads();
6048  if (doNbond) {
6049  f_sum_nbond = BlockReduceDouble3(temp_storage2).Reduce(f_sum_nbond, [](const double3& a, const double3& b){return a+b;});
6050  __syncthreads();
6051  }
6052  if (doSlow) {
6053  f_sum_slow = BlockReduceDouble3(temp_storage2).Reduce(f_sum_slow, [](const double3& a, const double3& b){return a+b;});
6054  __syncthreads();
6055  }
6056  if (threadIdx.x == 0) {
6057  atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
6058  atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
6059  atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
6060  atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
6061  atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
6062  atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
6063  atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
6064  atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
6065  atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
6066  atomicAdd(&(d_extForce_normal->x), f_sum_normal.x);
6067  atomicAdd(&(d_extForce_normal->y), f_sum_normal.y);
6068  atomicAdd(&(d_extForce_normal->z), f_sum_normal.z);
6069  if (doNbond) {
6070  atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
6071  atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
6072  atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
6073  atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
6074  atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
6075  atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
6076  atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
6077  atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
6078  atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
6079  atomicAdd(&(d_extForce_nbond->x), f_sum_nbond.x);
6080  atomicAdd(&(d_extForce_nbond->y), f_sum_nbond.y);
6081  atomicAdd(&(d_extForce_nbond->z), f_sum_nbond.z);
6082  }
6083  if (doSlow) {
6084  atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
6085  atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
6086  atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
6087  atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
6088  atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
6089  atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
6090  atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
6091  atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
6092  atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
6093  atomicAdd(&(d_extForce_slow->x), f_sum_slow.x);
6094  atomicAdd(&(d_extForce_slow->y), f_sum_slow.y);
6095  atomicAdd(&(d_extForce_slow->z), f_sum_slow.z);
6096  }
6097  }
6098 }
6099 
6100 __global__ void copyForcesToDeviceKernel(
6101  const int numAtoms,
6102  const double* d_f_nbond_x,
6103  const double* d_f_nbond_y,
6104  const double* d_f_nbond_z,
6105  const double* d_f_slow_x,
6106  const double* d_f_slow_y,
6107  const double* d_f_slow_z,
6108  double* d_f_saved_nbond_x,
6109  double* d_f_saved_nbond_y,
6110  double* d_f_saved_nbond_z,
6111  double* d_f_saved_slow_x,
6112  double* d_f_saved_slow_y,
6113  double* d_f_saved_slow_z,
6114  const int maxForceNumber
6115 ) {
6116  const int i = threadIdx.x + blockIdx.x * blockDim.x;
6117  if (i < numAtoms) {
6118  if (maxForceNumber >= 2) {
6119  d_f_saved_slow_x[i] = d_f_slow_x[i];
6120  d_f_saved_slow_y[i] = d_f_slow_y[i];
6121  d_f_saved_slow_z[i] = d_f_slow_z[i];
6122  }
6123  if (maxForceNumber >= 1) {
6124  d_f_saved_nbond_x[i] = d_f_nbond_x[i];
6125  d_f_saved_nbond_y[i] = d_f_nbond_y[i];
6126  d_f_saved_nbond_z[i] = d_f_nbond_z[i];
6127  }
6128  }
6129 }
6130 
6131 void SequencerCUDAKernel::calcFixVirial(
6132  const int maxForceNumber,
6133  const int numAtoms,
6134  const int* d_atomFixed,
6135  const double* d_fixedPosition_x,
6136  const double* d_fixedPosition_y,
6137  const double* d_fixedPosition_z,
6138  const double* d_f_normal_x,
6139  const double* d_f_normal_y,
6140  const double* d_f_normal_z,
6141  const double* d_f_nbond_x,
6142  const double* d_f_nbond_y,
6143  const double* d_f_nbond_z,
6144  const double* d_f_slow_x,
6145  const double* d_f_slow_y,
6146  const double* d_f_slow_z,
6147  cudaTensor* d_virial_normal,
6148  cudaTensor* d_virial_nbond,
6149  cudaTensor* d_virial_slow,
6150  double3* d_extForce_normal,
6151  double3* d_extForce_nbond,
6152  double3* d_extForce_slow,
6153  cudaStream_t stream)
6154 {
6155  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
6156 #define CALL(doNbond, doSlow) \
6157  calcFixVirialKernel<doNbond, doSlow><<<grid, ATOM_BLOCKS, 0, stream>>>( \
6158  numAtoms, d_atomFixed, \
6159  d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, \
6160  d_f_normal_x, d_f_normal_y, d_f_normal_z, \
6161  d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
6162  d_f_slow_x, d_f_slow_y, d_f_slow_z, \
6163  d_virial_normal, d_virial_nbond, d_virial_slow, \
6164  d_extForce_normal, d_extForce_nbond, d_extForce_slow \
6165  );
6166  switch (maxForceNumber) {
6167  case 0: CALL(false, false); break;
6168  case 1: CALL(true, false); break;
6169  case 2: CALL(true, true); break;
6170  default: NAMD_bug("No kernel is called in SequencerCUDAKernel::calcFixVirial!\n");
6171  }
6172 #undef CALL
6173 }
6174 
6175 void SequencerCUDAKernel::copyForcesToDevice(
6176  const int numAtoms,
6177  const double* d_f_nbond_x,
6178  const double* d_f_nbond_y,
6179  const double* d_f_nbond_z,
6180  const double* d_f_slow_x,
6181  const double* d_f_slow_y,
6182  const double* d_f_slow_z,
6183  double* d_f_saved_nbond_x,
6184  double* d_f_saved_nbond_y,
6185  double* d_f_saved_nbond_z,
6186  double* d_f_saved_slow_x,
6187  double* d_f_saved_slow_y,
6188  double* d_f_saved_slow_z,
6189  const int maxForceNumber,
6190  cudaStream_t stream
6191 ) {
6192  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
6193  copyForcesToDeviceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
6194  numAtoms, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
6195  d_f_slow_x, d_f_slow_y, d_f_slow_z,
6196  d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
6197  d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z,
6198  maxForceNumber
6199  );
6200 }
6201 
6202 #endif
6203 #endif // NODEGROUP_FORCE_REGISTER